From c2a5bb91448645130c317d94ea18b99a48e65b1a Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Thu, 30 Jun 2022 10:48:44 +0800 Subject: [PATCH 001/250] Add new attr of fused_multi_transformer (#43730) * Add new attr of fused_multi_transformer * fix format * add note * add in layer * fixfixfixfix --- .../fused/fused_multi_transformer_op.cc | 71 +++++++++++++------ .../fused/fused_multi_transformer_op.cu | 16 +++-- .../nn/functional/fused_transformer.py | 9 ++- .../incubate/nn/layer/fused_transformer.py | 9 ++- 4 files changed, 77 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc index aa05ebc43da78..86de140b9cde8 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cc +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -63,6 +64,7 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { // y: qkv's weight: [3, num_head, dim_head, dim_embed] auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputsDim("QKVW")[0]; + bool trans_qkvw = ctx->Attrs().Get("trans_qkvw"); PADDLE_ENFORCE_EQ( x_dim.size(), 3, @@ -79,24 +81,37 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { "but received dimensions of" "Input is [%d]", y_dim.size())); - PADDLE_ENFORCE_EQ(x_dim[2], - y_dim[3], - platform::errors::InvalidArgument( - "ShapeError: the dimension of x_dim[2] and y_dim[3]" - "must be equal. But received: the shape " - "of input x = [%s], and the shape of " - "input qkv_weight = [%s]", - x_dim, - y_dim)); + PADDLE_ENFORCE_EQ( + x_dim[2], + trans_qkvw ? y_dim[3] : y_dim[0], + platform::errors::InvalidArgument( + "ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is " + "true) or y_dim[0](trans_qkvw is false)" + "must be equal. But received: the shape " + "of input x = [%s], and the shape of " + "input qkv_weight = [%s]", + x_dim, + y_dim)); if (ctx->Attrs().Get("ring_id") == -1) { - PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], - y_dim[3], - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "and must satisfy the limitations: " - "(num_head * dim_head == dim_embed)")); + if (trans_qkvw) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], + y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + + } else { + PADDLE_ENFORCE_EQ(y_dim[2] * y_dim[3], + y_dim[0], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(dim_embed, 3, num_head, dim_head)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } } if (ctx->HasInputs("CacheKV")) { @@ -122,11 +137,11 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { x_dim[0], c_dim[1])); // batch_size PADDLE_ENFORCE_EQ(c_dim[2], - y_dim[1], + trans_qkvw ? y_dim[1] : y_dim[2], paddle::platform::errors::InvalidArgument( "The third dim of CacheKV must be equal with num " "head %d, but got %d", - y_dim[1], + trans_qkvw ? 
y_dim[1] : y_dim[2], c_dim[2])); // num_head PADDLE_ENFORCE_GT( c_dim[3], @@ -135,11 +150,11 @@ class FusedMultiTransformerOp : public framework::OperatorWithKernel { "The forth dim of CacheKV must be greater than 0, but got %d", c_dim[3])); // cache_seq_len PADDLE_ENFORCE_EQ(c_dim[4], - y_dim[2], + trans_qkvw ? y_dim[2] : y_dim[3], paddle::platform::errors::InvalidArgument( "The fifth dim of CacheKV must be equal with head " "size %d, but got %d", - y_dim[2], + trans_qkvw ? y_dim[2] : y_dim[3], c_dim[4])); // head_size } @@ -258,6 +273,13 @@ class FusedMultiTransformerOpOpMaker "upscale_in_train")); }); AddAttr("act_method", "act_method").SetDefault("gelu"); + AddAttr( + "trans_qkvw", + "Whether the weights of qkv should be transposed. If true," + "the shape eights of qkv should be [3, num_head, dim_head, dim_embed]." + "Otherwise the shape of weights of qkv should be" + "[dim_embed, 3, num_head, dim_head]") + .SetDefault(true); AddAttr( "ring_id", @@ -278,3 +300,12 @@ REGISTER_OPERATOR( ops::FusedMultiTransformerOpOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_VERSION(fused_multi_transformer) + .AddCheckpoint( + R"ROC( + Add a new attribute [trans_qkvw] )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "trans_qkvw", + "A flag to indicate whether to transpose for weights of qkv.", + true)); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index ca2b884bf79f6..f806359093cb2 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -1119,17 +1119,23 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // y: qkv's weight: [3, num_head, dim_head, dim_embed] auto qkv_weights = ctx.MultiInput("QKVW"); auto qkv_biases = ctx.MultiInput("QKVBias"); + const bool trans_qkvw = ctx.Attr("trans_qkvw"); const auto qkv_w_dims = qkv_weights[0]->dims(); - int num_head = qkv_w_dims[1]; - int dim_head = qkv_w_dims[2]; + int num_head = trans_qkvw ? qkv_w_dims[1] : qkv_w_dims[2]; + int dim_head = trans_qkvw ? qkv_w_dims[2] : qkv_w_dims[3]; int hidden_size = num_head * dim_head; int output_size = 3 * hidden_size; int input_size = dim_embed; bool compute_bias = qkv_biases.size() > 0 && time_step == nullptr; - // (transA, transB, compute_bias) = (false, true, false) - auto qkv_compute = AttnMatMul( - dev_ctx, false, true, bsz_seq, output_size, input_size, compute_bias); + // (transA, transB, compute_bias) = (false, trans_qkvw, false) + auto qkv_compute = AttnMatMul(dev_ctx, + false, + trans_qkvw, + bsz_seq, + output_size, + input_size, + compute_bias); Tensor qkv_out; auto *qkv_out_data = qkv_out.mutable_data({bsz, seq_len, 3, num_head, dim_head}, place); diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 3e4d015da1b11..506a282171bbd 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -680,6 +680,7 @@ def fused_multi_transformer(x, activation="gelu", training=False, mode='upscale_in_train', + trans_qkvw=True, ring_id=-1, name=None): r""" @@ -756,6 +757,9 @@ def fused_multi_transformer(x, - train: out = input * mask - inference: out = input * (1.0 - p) + trans_qkvw (bool, optional): Whether to transpose for weights of qkv. 
+ If true, the shape eights of qkv should be [3, num_head, dim_head, dim_embed]. + Otherwise the shape of weights of qkv should be [dim_embed, 3, num_head, dim_head]. Default True. ring_id (int, optional): For distributed forward in tensor model parallel, only support NCCL. Default is -1, means not using mp. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -826,8 +830,8 @@ def fused_multi_transformer(x, ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases, cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon, 'dropout_rate', dropout_rate, 'is_test', not training, - 'dropout_implementation', mode, 'act_method', activation, 'ring_id', - ring_id) + 'dropout_implementation', mode, 'act_method', activation, + 'trans_qkvw', trans_qkvw, 'ring_id', ring_id) if cache_kvs is not None: return final_out, cache_kv_out return final_out @@ -875,6 +879,7 @@ def fused_multi_transformer(x, 'is_test': not training, 'dropout_implementation': mode, 'act_method': activation, + 'trans_qkvw': trans_qkvw, 'ring_id': ring_id } diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index 4a8f7815ae9d8..ba14ac5b86529 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -1048,6 +1048,9 @@ class FusedMultiTransformer(Layer): is a list or tuple, the number of layers is obtained from `qkv_weight_attrs`. num_layers only takes effect when `qkv_weight_attrs` is not a list or tuple. Default: -1. nranks (int, optional): Distributed tensor model parallel nranks. Default is 1, means not using mp. + trans_qkvw (bool, optional): Whether to transpose for weights of qkv. + If true, the shape eights of qkv should be [3, num_head, dim_head, dim_embed]. + Otherwise the shape of weights of qkv should be [dim_embed, 3, num_head, dim_head]. Default: True. ring_id (int, optional): For distributed tensor model parallel. Default is -1, means not using mp. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
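For readers comparing the two layouts, the shape difference that trans_qkvw introduces can be sanity-checked with a small NumPy sketch. This is illustrative only and not part of the patch; num_head and dim_head below are placeholder values.

    import numpy as np

    # Placeholder sizes; the op requires num_head * dim_head == dim_embed.
    num_head, dim_head = 16, 64
    dim_embed = num_head * dim_head

    # trans_qkvw=True (the default): qkv weights laid out as [3, num_head, dim_head, dim_embed].
    qkv_w_trans = np.empty([3, num_head, dim_head, dim_embed], dtype="float32")
    # trans_qkvw=False: qkv weights laid out as [dim_embed, 3, num_head, dim_head].
    qkv_w_plain = np.empty([dim_embed, 3, num_head, dim_head], dtype="float32")

    # Mirrors the InferShape check above: x_dim[2] (dim_embed) is compared with
    # y_dim[3] when trans_qkvw is true, and with y_dim[0] when it is false.
    assert qkv_w_trans.shape[3] == dim_embed
    assert qkv_w_plain.shape[0] == dim_embed

As the CUDA changes above show, the flag is simply forwarded as the transB argument of AttnMatMul, so both layouts are handled by the same GEMM call.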
@@ -1090,6 +1093,7 @@ def __init__(self, epsilon=1e-5, num_layers=-1, nranks=1, + trans_qkvw=True, ring_id=-1, name=None): super(FusedMultiTransformer, self).__init__() @@ -1105,6 +1109,7 @@ def __init__(self, self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() self._epsilon = epsilon + self._trans_qkvw = trans_qkvw self._ring_id = ring_id self.embed_dim = embed_dim @@ -1161,7 +1166,8 @@ def get_attr(attrs, idx): shape=[embed_dim], is_bias=True) qkv_weight = self.create_parameter( - shape=[3, num_heads, self.head_dim, embed_dim], + shape=[3, num_heads, self.head_dim, embed_dim] + if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim], attr=qkv_weight_attr, dtype=self._dtype, is_bias=False) @@ -1292,6 +1298,7 @@ def forward(self, src, attn_mask=None, caches=None, time_step=None): activation=self.activation, training=self.training, mode='upscale_in_train', + trans_qkvw=self._trans_qkvw, ring_id=self._ring_id, name=self.name) return out From 6467ca0da8013e0292b3ac1ead7647276e4181e5 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Thu, 30 Jun 2022 11:06:23 +0800 Subject: [PATCH 002/250] modify graph_pattern to thread_local (#43942) * modify graph_pattern to thread_local * modify graph_pattern to thread_local --- paddle/fluid/framework/ir/graph_pattern_detector.cc | 6 ++++++ paddle/fluid/framework/ir/graph_pattern_detector.h | 8 ++++++++ paddle/fluid/inference/api/analysis_predictor.cc | 4 ++++ 3 files changed, 18 insertions(+) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d5b6122886850..7ad02fe5ab87f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -29,6 +29,12 @@ using string::Style; size_t PDPattern::id_ = 0UL; +#ifdef PADDLE_WITH_TENSORRT +namespace patterns { +thread_local std::unordered_map KeyCounter::dic_; +} +#endif + PDNode *PDPattern::NewNode(const std::string &name) { if (!name.empty()) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0cc216b6e0de2..29d645f6beba0 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -406,10 +406,18 @@ struct KeyCounter { return x; } +#ifdef PADDLE_WITH_TENSORRT + static int IncCounter(const std::string& key) { return dic_[key]++; } + static void CleanCounter() { dic_.clear(); } + + private: + static thread_local std::unordered_map dic_; +#else int IncCounter(const std::string& key) { return dic_[key]++; } private: std::unordered_map dic_; +#endif }; // Generate a unique PDNode's name with name_scope and id. 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 8a2083ea226b4..62f89e300bfbd 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1360,6 +1360,10 @@ CreatePaddlePredictor( config.SetInValid(); auto predictor_p = dynamic_cast(predictor.get()); +#ifdef PADDLE_WITH_TENSORRT + paddle::framework::ir::patterns::KeyCounter::Instance().CleanCounter(); +#endif + if (!predictor_p->Init(nullptr)) { return nullptr; } From f720e231e5e646b0d88c6e1c5ebf9a2ab010e591 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Thu, 30 Jun 2022 14:45:45 +0800 Subject: [PATCH 003/250] Remove boost::variant for FetchResultType (#43932) * Remove boost::variant for FetchResultType * Fix pybind errors --- .../details/async_ssa_graph_executor.cc | 2 +- .../details/fetch_async_op_handle.cc | 4 +- .../framework/details/fetch_op_handle.cc | 6 +- .../details/parallel_ssa_graph_executor.cc | 5 +- paddle/fluid/framework/feed_fetch_type.h | 2 +- paddle/fluid/framework/parallel_executor.cc | 86 ++++++++++++------- paddle/fluid/framework/parallel_executor.h | 6 +- paddle/fluid/pybind/pybind.cc | 40 +++++---- 8 files changed, 90 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index f22e62fa0aa5b..0ae69695549e5 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -174,7 +174,7 @@ FetchResultType AsyncSSAGraphExecutor::Run( HandleException(); FetchList ret; - auto &val = boost::get(fetch_data); + auto &val = BOOST_GET(FetchList, fetch_data); for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { if (data_is_lod_tensor(val.at(fetch_idx))) { std::vector lodtensor_ptrs; diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 8d8bb96f5c8ed..a9e4bf826bc4b 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -228,7 +228,7 @@ void FetchAsyncOpHandle::RunImpl() { } if (return_merged_) { - auto &val = boost::get(*data_); + auto &val = BOOST_GET(FetchList, *data_); if (src_vars[0]->IsType()) { // to lodtensor type std::vector src_lodtensors; @@ -263,7 +263,7 @@ void FetchAsyncOpHandle::RunImpl() { val.at(offset_) = std::move(dst_lodtensor_array); } } else { - auto &val = boost::get(*data_); + auto &val = BOOST_GET(FetchUnmergedList, *data_); auto &dst_tensors = val.at(offset_); dst_tensors.reserve(src_vars.size()); diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index f160650f0b9f4..a9f7de8ee312f 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -84,7 +84,7 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const { for (auto &t : tensors_) { tensors_ptr.emplace_back(&BOOST_GET_CONST(LoDTensor, t)); } - auto &val = boost::get(*data_); + auto &val = BOOST_GET(FetchList, *data_); LoDTensor var; MergeLoDTensor(&var, tensors_ptr, platform::CPUPlace()); val.at(offset_) = std::move(var); @@ -106,11 +106,11 @@ void FetchOpHandle::WaitAndMergeCPUFetchVars() const { tmp_array.emplace_back(); MergeLoDTensor(&(tmp_array.back()), tensors_ptr, platform::CPUPlace()); } - auto &val = boost::get(*data_); + auto &val 
= BOOST_GET(FetchList, *data_); val.at(offset_) = std::move(tmp_array); } } else { - auto &val = boost::get(*data_); + auto &val = BOOST_GET(FetchUnmergedList, *data_); val.at(offset_) = std::move(tensors_); } } diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 86536b74a3d7c..bc870c0eaa18d 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -278,7 +278,8 @@ FetchResultType ParallelSSAGraphExecutor::Run( if (!is_valid[scope_idx]) { continue; } - const auto &fetch_list = boost::get(fetch_data[scope_idx]); + const auto &fetch_list = + BOOST_GET_CONST(FetchList, fetch_data[scope_idx]); if (data_is_lod_tensor(fetch_list[fetch_idx])) { lodtensor_ptrs.push_back( &(BOOST_GET_CONST(LoDTensor, fetch_list[fetch_idx]))); @@ -317,7 +318,7 @@ FetchResultType ParallelSSAGraphExecutor::Run( continue; } const auto &fetch_list = - boost::get(fetch_data[scope_idx]); + BOOST_GET_CONST(FetchUnmergedList, fetch_data[scope_idx]); PADDLE_ENFORCE_EQ( fetch_list[fetch_idx].size(), 1, diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index c86cdc998133b..8ecd6a0339b5b 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -30,7 +30,7 @@ using FetchType = paddle::variant; using FetchList = std::vector; using FetchUnmergedList = std::vector>; -using FetchResultType = boost::variant; +using FetchResultType = paddle::variant; inline bool data_is_lod_tensor(const FetchType &data) { if (data.type() == typeid(LoDTensor)) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index fffacc59ba7bc..697cb8cdcf6e8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -972,37 +972,26 @@ void ParallelExecutor::BCastParamsToDevices( } } -FetchResultType ParallelExecutor::Run( - const std::vector &fetch_tensors, bool return_merged) { - platform::RecordEvent record_run( - "ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1); - VLOG(3) << "enter ParallelExecutor Run"; -#ifdef PADDLE_WITH_CUDA - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ(fetch_tensors.empty(), - true, - platform::errors::InvalidArgument( - "Cannot fetch data when using CUDA Graph.")); - PADDLE_ENFORCE_EQ( - member_->build_strategy_.allow_cuda_graph_capture_, - true, - platform::errors::InvalidArgument( - "You must turn on build_strategy.allow_cuda_graph_capture = True " - "to enable CUDA Graph capturing.")); - PADDLE_ENFORCE_EQ( - member_->places_[0], - platform::CUDAGraphCapturingPlace(), - platform::errors::InvalidArgument("The place to capture CUDAGraph is " - "not the same as the place to run.")); - } -#endif +FetchUnmergedList ParallelExecutor::Run( + const std::vector &fetch_tensors) { + PreludeToRun(fetch_tensors); + platform::RecordBlock b(0); -#ifdef WITH_GPERFTOOLS - if (gProfileStarted) { - ProfilerFlush(); - } -#endif + ResetHasFeedGuard reset_has_feed_guard(member_); + + ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), + fetch_tensors, + member_->HasGarbageCollectors()); + VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; + auto fetch_data = + member_->executor_->Run(fetch_tensors, /*return_merged=*/false); + return BOOST_GET(FetchUnmergedList, fetch_data); +} + +FetchList ParallelExecutor::RunAndMerge( + 
const std::vector &fetch_tensors) { + PreludeToRun(fetch_tensors); platform::RecordBlock b(0); ResetHasFeedGuard reset_has_feed_guard(member_); @@ -1011,9 +1000,10 @@ FetchResultType ParallelExecutor::Run( fetch_tensors, member_->HasGarbageCollectors()); - VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; - auto fetch_data = member_->executor_->Run(fetch_tensors, return_merged); - return fetch_data; + VLOG(3) << "ParallelExecutor begin to run member_->executor_->RunAndMerge"; + auto fetch_data = + member_->executor_->Run(fetch_tensors, /*return_merged=*/true); + return BOOST_GET(FetchList, fetch_data); } void ParallelExecutor::RunWithoutFetch( @@ -1440,6 +1430,38 @@ std::vector ParallelExecutor::CloneGraphToMultiDevices( return graphs; } +void ParallelExecutor::PreludeToRun( + const std::vector &fetch_tensors) { + platform::RecordEvent record_run( + "ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1); + VLOG(3) << "enter ParallelExecutor Run"; +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_tensors.empty(), + true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + PADDLE_ENFORCE_EQ( + member_->build_strategy_.allow_cuda_graph_capture_, + true, + platform::errors::InvalidArgument( + "You must turn on build_strategy.allow_cuda_graph_capture = True " + "to enable CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + member_->places_[0], + platform::CUDAGraphCapturingPlace(), + platform::errors::InvalidArgument("The place to capture CUDAGraph is " + "not the same as the place to run.")); + } +#endif + +#ifdef WITH_GPERFTOOLS + if (gProfileStarted) { + ProfilerFlush(); + } +#endif +} + void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { if (member_->build_strategy_.reduce_ == BuildStrategy::ReduceStrategy::kNoReduce) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 4cb9c0340b53c..a3b812a71a2b7 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -89,8 +89,8 @@ class ParallelExecutor { void FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors); - FetchResultType Run(const std::vector &fetch_tensors, - bool return_merged = true); + FetchUnmergedList Run(const std::vector &fetch_tensors); + FetchList RunAndMerge(const std::vector &fetch_tensors); void RunWithoutFetch(const std::vector &skip_eager_vars); @@ -126,6 +126,8 @@ class ParallelExecutor { std::vector CloneGraphToMultiDevices(ir::Graph *graph); + void PreludeToRun(const std::vector &fetch_tensors); + void PrepareNCCLCommunicator(Scope *global_scope); std::vector CompileGraphWithBuildStrategy( diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c294c8eb4a7c9..18a3fb1aab86b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3225,13 +3225,17 @@ All parameter, weight, gradient are variables in Paddle. #endif m.def("set_feed_variable", - static_cast( - &framework::SetFeedVariable)); + static_cast(&framework::SetFeedVariable)); m.def("set_feed_variable", - static_cast( - &framework::SetFeedVariable)); + static_cast(&framework::SetFeedVariable)); m.def("get_fetch_variable", [](const Scope &scope, const std::string &var_name, @@ -4601,20 +4605,20 @@ All parameter, weight, gradient are variables in Paddle. 
[](ParallelExecutor &self, const std::vector &fetch_tensors, bool return_merged) -> py::object { - paddle::framework::FetchResultType ret; - { - pybind11::gil_scoped_release release; - ret = self.Run(fetch_tensors, return_merged); - } - - // TODO(Ruibiao): Refactor the run interface of PE to avoid use - // boost::get here if (return_merged) { - return py::cast( - std::move(boost::get(ret))); + paddle::framework::FetchList ret; + /*gil_scoped_release*/ { + pybind11::gil_scoped_release release; + ret = self.RunAndMerge(fetch_tensors); + } + return py::cast(std::move(ret)); } else { - return py::cast(std::move( - boost::get(ret))); + paddle::framework::FetchUnmergedList ret; + /*gil_scoped_release*/ { + pybind11::gil_scoped_release release; + ret = self.Run(fetch_tensors); + } + return py::cast(std::move(ret)); } }) .def("device_count", &ParallelExecutor::DeviceCount); From 8279dfeae676fd6df72f4215fb14055c41f6d409 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 30 Jun 2022 15:00:28 +0800 Subject: [PATCH 004/250] [Dy2Static] Add non-local for while and for. (#43864) * merge and add base support for non-local for * for and while non-local support * fix ci errors: v1 * fix bug * fix * fix code * fix * fix * fix --- .../dygraph_to_static/convert_operators.py | 29 +++- .../dygraph_to_static/ifelse_transformer.py | 80 ++------- .../dygraph_to_static/loop_transformer.py | 157 ++++++++--------- .../fluid/dygraph/dygraph_to_static/utils.py | 161 +++++++++++++++++- .../dygraph_to_static/variable_trans_func.py | 20 +-- .../unittests/dygraph_to_static/test_list.py | 1 - .../unittests/dygraph_to_static/test_loop.py | 2 +- 7 files changed, 268 insertions(+), 182 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index cbb4655f354a5..a6cab0db51380 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -24,7 +24,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar -def convert_while_loop(cond, body, loop_vars): +def convert_while_loop(cond, body, getter, setter): """ A function representation of a Python ``while`` statement. @@ -39,25 +39,36 @@ def convert_while_loop(cond, body, loop_vars): # NOTE: It may be slower if cond is very expensive, but usually cond is just O(1). # If loop_vars is changed during cond callable, then it causes bug, but current logical_and/logical_not/... doesn't change the loop_vars. - pred = cond(*loop_vars) + pred = cond() if isinstance(pred, Variable): - loop_vars = _run_paddle_while_loop(cond, body, loop_vars) + loop_vars = _run_paddle_while(cond, body, getter, setter) else: - loop_vars = _run_py_while(cond, body, loop_vars) + loop_vars = _run_py_while(cond, body, getter, setter) return loop_vars -def _run_paddle_while_loop(cond, body, loop_vars): +def _run_paddle_while(cond, body, getter, setter): # NOTE: loop_vars of Paddle op `control_flow.while_loop` must be Paddle Tensors. - loop_vars = [to_static_variable(var) for var in loop_vars] + def to_list(x): + if isinstance(x, (tuple, list)): return x + return [x] + + # UndefinedVar will become data layer not check. + loop_vars = [to_static_variable(var) for var in to_list(getter())] + setter(loop_vars if len(loop_vars) > 1 else + loop_vars[0]) # change the non-local var to variable + # variable maybe modified to inner var. 
change it into loop_vars = control_flow.while_loop(cond, body, loop_vars) + setter(loop_vars if len(loop_vars) > 1 else + loop_vars[0]) # change the non-local var to variable return loop_vars -def _run_py_while(cond, body, loop_vars): - while cond(*loop_vars): - loop_vars = body(*loop_vars) +def _run_py_while(cond, body, getter, setter): + loop_vars = getter() + while cond(): + loop_vars = body() return loop_vars diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 1935629f54e86..d4449f6dfc24e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -31,7 +31,8 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var -from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_nonlocal_stmt_node +from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_node +from paddle.fluid.dygraph.dygraph_to_static.utils import create_get_args_node, create_set_args_node TRUE_FUNC_PREFIX = 'true_fn' FALSE_FUNC_PREFIX = 'false_fn' @@ -415,17 +416,22 @@ def _vars_loaded(ids_dict): # modified vars body_modified_vars = _modified_vars(if_vars_dict, parent_vars_dict) + body_modified_vars = set( + filter(lambda x: x != ARGS_NAME, body_modified_vars)) orelse_modified_vars = _modified_vars(else_vars_dict, parent_vars_dict) + orelse_modified_vars = set( + filter(lambda x: x != ARGS_NAME, orelse_modified_vars)) modified_vars = body_modified_vars | orelse_modified_vars # new vars + # TODO(remove __args when new FunctionScopeAnalysis has been used.) body_new_vars = set([ var for var in _vars_with_store(if_vars_dict) - if var not in parent_vars_dict + if var not in parent_vars_dict and var != ARGS_NAME ]) orelse_new_vars = set([ var for var in _vars_with_store(else_vars_dict) - if var not in parent_vars_dict + if var not in parent_vars_dict and var != ARGS_NAME ]) new_vars_in_body_or_orelse = body_new_vars | orelse_new_vars new_vars_in_one_of_body_or_orelse = body_new_vars ^ orelse_new_vars @@ -511,11 +517,11 @@ def transform_if_else(node, root): if any([not isinstance(ctx, gast.Load) for ctx in ctxs]): parent_ids_set.add(k) - trun_args = parse_cond_args(parent_ids_set, body_name_ids, + true_args = parse_cond_args(parent_ids_set, body_name_ids, modified_name_ids_from_parent) false_args = parse_cond_args(parent_ids_set, orelse_name_ids, modified_name_ids_from_parent) - nonlocal_names = list(trun_args | false_args | new_vars_to_create) + nonlocal_names = list(true_args | false_args | new_vars_to_create) nonlocal_names.sort() # NOTE: All var in return_name_ids should be in nonlocal_names. 
nonlocal_names = _valid_nonlocal_names(return_name_ids, nonlocal_names) @@ -552,70 +558,6 @@ def transform_if_else(node, root): return create_new_vars_in_parent_stmts, true_func_node, false_func_node, get_args_node, set_args_node, return_name_ids -def create_get_args_node(names): - """ - Create get_args function as follows: - - def get_args_0(): - nonlocal x, y - return x, y - """ - - def empty_node(): - func_def = """ - def {func_name}(): - return - """.format(func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX)) - return gast.parse(textwrap.dedent(func_def)).body[0] - - assert isinstance(names, (list, tuple)) - if not names: - return empty_node() - - template = """ - def {func_name}(): - nonlocal {vars} - return {vars} - """ - func_def = template.format( - func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX), - vars=",".join(names)) - return gast.parse(textwrap.dedent(func_def)).body[0] - - -def create_set_args_node(names): - """ - Create set_args function as follows: - - def set_args_0(__args): - nonlocal x, y - x, y = __args - """ - - def empty_node(): - func_def = """ - def {func_name}({args}): - pass - """.format(func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), - args=ARGS_NAME) - return gast.parse(textwrap.dedent(func_def)).body[0] - - assert isinstance(names, (list, tuple)) - if not names: - return empty_node() - - template = """ - def {func_name}({args}): - nonlocal {vars} - {vars} = {args} - """ - func_def = template.format( - func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), - args=ARGS_NAME, - vars=",".join(names)) - return gast.parse(textwrap.dedent(func_def)).body[0] - - def create_convert_ifelse_node(return_name_ids, pred, true_func, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 832c502c0aa5c..63fc4f0489acb 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -28,7 +28,10 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ForLoopTuplePreTransformer from paddle.fluid.dygraph.dygraph_to_static.utils import ForNodeVisitor from paddle.fluid.dygraph.dygraph_to_static.utils import RenameTransformer +from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node +from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_node, create_get_args_node, create_set_args_node +from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import ARGS_NAME __all__ = ['LoopTransformer', 'NameVisitor'] @@ -37,12 +40,10 @@ FOR_CONDITION_PREFIX = 'for_loop_condition' FOR_BODY_PREFIX = 'for_loop_body' -GENERATE_VARIABLE_PREFIX = 'generate_variable' -ATTRIBUTE_VARIABLE_PREFIX = '__attribute_variable' - -def create_while_nodes(condition_name, body_name, loop_var_names): +def create_while_nodes(condition_name, body_name, loop_var_names, getter_name, + setter_name): """ Returns a list of gast.Node which represents the calling of Paddle controlflow while_loop. @@ -74,37 +75,20 @@ def create_while_nodes(condition_name, body_name, loop_var_names): # # For example: loop_var_names = [a, b, foo.x], the type of `a` or `b` is gast.Name, # but the type of `foo.x` gast.Attribute. 
- - unique_name_to_origin = {} # We have to make loop_var_names and assign_loop_var_names with same order # set doesn't have order so we convert it to list loop_var_names = list(loop_var_names) assign_loop_var_names = [] for name in (loop_var_names): - if "." in name: - # name is an attribute variable such as foo.x - tmp_attr_name = unique_name.generate(ATTRIBUTE_VARIABLE_PREFIX) - unique_name_to_origin[tmp_attr_name] = name - assign_loop_var_names.append(tmp_attr_name) - else: - assign_loop_var_names.append(name) + assign_loop_var_names.append(name) while_func_name = "_jst.While" - while_node_str = "[{}] = {}({}, {}, [{}])".format( - ",".join(assign_loop_var_names), while_func_name, condition_name, - body_name, ",".join(loop_var_names)) + while_node_str = "{}({}, {}, {}, {})".format(while_func_name, + condition_name, body_name, + getter_name, setter_name) while_node = gast.parse(while_node_str).body[0] ret = [while_node] - for tmp_attr_name in unique_name_to_origin: - origin_attr_var = unique_name_to_origin[tmp_attr_name] - dot_pos = origin_attr_var.rindex(".") - obj_name = origin_attr_var[0:dot_pos] - attr_name = origin_attr_var[dot_pos + 1:] - assign_if_not_prop_str = "if not isinstance(getattr(type({}), '{}', None), property): {} = {}".format( - obj_name, attr_name, origin_attr_var, tmp_attr_name) - assign_if_not_prop_node = gast.parse(assign_if_not_prop_str).body[0] - ret.append(assign_if_not_prop_node) return ret @@ -117,8 +101,10 @@ def __init__(self): self.globals = set() self.nonlocals = set() self.args = set() - self.w_vars = set() # all vars been stored, + # all vars been stored, # may be globals or non-locals + self.w_vars = set() + def created_vars(self): return self.w_vars - self.globals - self.nonlocals - self.args @@ -282,9 +268,7 @@ def get_loop_var_names(self, node): # If this var is a basic variable and read-only and not # condition var, it may not be loop_var else it should # be in loop_var as input - if (not name in condition_names) and ( - not name in write_names - ) and self._node_var_type_is_basic(name_to_type[name]): + if (not name in condition_names) and (not name in write_names): continue loop_var_names.add(name) @@ -645,7 +629,6 @@ def get_for_stmt_nodes(self, node): if stmts_tuple is None: return [node] init_stmts, cond_stmt, body_stmts = stmts_tuple - # 2. get original loop vars loop_var_names, create_var_names = self.name_visitor.get_loop_var_names( node) @@ -672,7 +655,16 @@ def get_for_stmt_nodes(self, node): # We need to create static variable for those variables for name in create_var_names: if "." not in name: - new_stmts.append(create_fill_constant_node(name)) + new_stmts.append(create_undefined_var(name)) + + # create non-local statement for body and cond. + nonlocal_names = list(loop_var_names | create_var_names) + nonlocal_names.sort() + # TODO(dev): Need a better way to deal this. + if ARGS_NAME in nonlocal_names: + nonlocal_names.remove(ARGS_NAME) + + nonlocal_stmt_node = [create_nonlocal_stmt_node(nonlocal_names)] # 4. append init statements new_stmts.extend(init_stmts) @@ -680,63 +672,54 @@ def get_for_stmt_nodes(self, node): # 5. 
create & append condition function node condition_func_node = gast.FunctionDef( name=unique_name.generate(FOR_CONDITION_PREFIX), - args=gast.arguments(args=[ - gast.Name(id=name, - ctx=gast.Param(), - annotation=None, - type_comment=None) for name in loop_var_names - ], + args=gast.arguments(args=[], posonlyargs=[], - vararg=None, + vararg=gast.Name(id=ARGS_NAME, + ctx=gast.Param(), + annotation=None, + type_comment=None), kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), - body=[gast.Return(value=cond_stmt)], + body=nonlocal_stmt_node + [gast.Return(value=cond_stmt)], decorator_list=[], returns=None, type_comment=None) - for name in loop_var_names: - if "." in name: - rename_transformer = RenameTransformer(condition_func_node) - rename_transformer.rename( - name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(condition_func_node) # 6. create & append loop body function node # append return values for loop body body_stmts.append( gast.Return(value=generate_name_node( - loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True))) + nonlocal_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(FOR_BODY_PREFIX), - args=gast.arguments(args=[ - gast.Name(id=name, - ctx=gast.Param(), - annotation=None, - type_comment=None) for name in loop_var_names - ], + args=gast.arguments(args=[], posonlyargs=[], - vararg=None, + vararg=gast.Name(id=ARGS_NAME, + ctx=gast.Param(), + annotation=None, + type_comment=None), kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), - body=body_stmts, + body=nonlocal_stmt_node + body_stmts, decorator_list=[], returns=None, type_comment=None) - for name in loop_var_names: - if "." in name: - rename_transformer = RenameTransformer(body_func_node) - rename_transformer.rename( - name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(body_func_node) + get_args_node = create_get_args_node(nonlocal_names) + set_args_node = create_set_args_node(nonlocal_names) # 7. create & append while loop node while_loop_nodes = create_while_nodes(condition_func_node.name, body_func_node.name, - loop_var_names) + nonlocal_names, + get_args_node.name, + set_args_node.name) + new_stmts.extend([get_args_node, set_args_node]) new_stmts.extend(while_loop_nodes) return new_stmts @@ -746,6 +729,15 @@ def get_while_stmt_nodes(self, node): node) new_stmts = [] + # create non-local statement for body and cond. + nonlocal_names = list(loop_var_names | create_var_names) + nonlocal_names.sort() + # TODO(dev): Need a better way to deal this. + if ARGS_NAME in nonlocal_names: + nonlocal_names.remove(ARGS_NAME) + + nonlocal_stmt_node = [create_nonlocal_stmt_node(nonlocal_names)] + # Python can create variable in loop and use it out of loop, E.g. # # while x < 10: @@ -760,61 +752,52 @@ def get_while_stmt_nodes(self, node): condition_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_CONDITION_PREFIX), - args=gast.arguments(args=[ - gast.Name(id=name, - ctx=gast.Param(), - annotation=None, - type_comment=None) for name in loop_var_names - ], + args=gast.arguments(args=[], posonlyargs=[], - vararg=None, + vararg=gast.Name(id=ARGS_NAME, + ctx=gast.Param(), + annotation=None, + type_comment=None), kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), - body=[gast.Return(value=node.test)], + body=nonlocal_stmt_node + [gast.Return(value=node.test)], decorator_list=[], returns=None, type_comment=None) - for name in loop_var_names: - if "." 
in name: - rename_transformer = RenameTransformer(condition_func_node) - rename_transformer.rename( - name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(condition_func_node) new_body = node.body new_body.append( gast.Return(value=generate_name_node( - loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True))) + nonlocal_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_BODY_PREFIX), - args=gast.arguments(args=[ - gast.Name(id=name, - ctx=gast.Param(), - annotation=None, - type_comment=None) for name in loop_var_names - ], + args=gast.arguments(args=[], posonlyargs=[], - vararg=None, + vararg=gast.Name(id=ARGS_NAME, + ctx=gast.Param(), + annotation=None, + type_comment=None), kwonlyargs=[], kw_defaults=None, kwarg=None, defaults=[]), - body=new_body, + body=nonlocal_stmt_node + new_body, decorator_list=[], returns=None, type_comment=None) - for name in loop_var_names: - if "." in name: - rename_transformer = RenameTransformer(body_func_node) - rename_transformer.rename( - name, unique_name.generate(GENERATE_VARIABLE_PREFIX)) new_stmts.append(body_func_node) + get_args_node = create_get_args_node(nonlocal_names) + set_args_node = create_set_args_node(nonlocal_names) while_loop_nodes = create_while_nodes(condition_func_node.name, body_func_node.name, - loop_var_names) + nonlocal_names, + get_args_node.name, + set_args_node.name) + new_stmts.extend([get_args_node, set_args_node]) new_stmts.extend(while_loop_nodes) return new_stmts diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 8dd11c06e463f..466e9ee4d34c1 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -30,6 +30,8 @@ import paddle from paddle.fluid import unique_name from paddle.fluid.data_feeder import convert_dtype +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid import core # Note(Aurelius): Do not forget the dot `.` to distinguish other # module such as paddlenlp. @@ -59,6 +61,51 @@ def visit(self, node): return ret +def data_layer_not_check(name, shape, dtype='float32', lod_level=0): + """ + This function creates a Tensor on the global block. The created Tensor + doesn't check the dtype and the shape of feed data because dygraph input + data can be various-length. This API is used in translating dygraph into + static graph. + + Note: + The default :code:`stop_gradient` attribute of the Tensor created by + this API is true, which means the gradient won't be passed backward + through the data Tensor. Set :code:`var.stop_gradient = False` If + user would like to pass backward gradient. + + Args: + name (str): The name/alias of the Tensor, see :ref:`api_guide_Name` + for more details. + shape (list|tuple): List|Tuple of integers declaring the shape. You can + set "None" at a dimension to indicate the dimension can be of any + size. For example, it is useful to set changeable batch size as "None" + dtype (np.dtype|VarType|str, optional): The type of the data. Supported + dtype: bool, float16, float32, float64, int8, int16, int32, int64, + uint8. Default: float32 + lod_level (int, optional): The LoD level of the LoDTensor. Usually users + don't have to set this value. For more details about when and how to + use LoD level, see :ref:`user_guide_lod_tensor` . Default: 0 + + Returns: + Tensor: The global Tensor that gives access to the data. 
+ """ + helper = LayerHelper('data', **locals()) + shape = list(shape) + for i in six.moves.range(len(shape)): + if shape[i] is None: + shape[i] = -1 + + return helper.create_variable(name=name, + shape=shape, + dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + stop_gradient=True, + lod_level=lod_level, + is_data=True, + need_check_feed=False) + + # imp is deprecated in python3 from importlib.machinery import SourceFileLoader @@ -412,10 +459,16 @@ def generate_name_node(name_ids, ctx=gast.Load(), gen_tuple_if_single=False): raise TypeError( 'name_ids must be list or tuple or set, but received %s' % type(type(name_ids))) - gast_names = [ - gast.Name(id=name_id, ctx=ctx, annotation=None, type_comment=None) - for name_id in name_ids - ] + + def create_node_for_name(name): + if '.' not in name: + return gast.Name(id=name, + ctx=ctx, + annotation=None, + type_comment=None) + return gast.parse(name).body[0].value + + gast_names = [create_node_for_name(name_id) for name_id in name_ids] if len(gast_names) == 1 and not gen_tuple_if_single: name_node = gast_names[0] else: @@ -842,6 +895,16 @@ def visit_Name(self, node): return self.replace_node return node + def visit_Nonlocal(self, node): + names = node.names + + def replace(s): + if s == self.target_name: return self.replace_node.id + return s + + node.names = list(map(replace, names)) + return node + class ForLoopTuplePreTransformer(gast.NodeTransformer): """ @@ -1527,3 +1590,93 @@ def slice_is_num(slice_node): return True return False + + +def create_get_args_node(names): + """ + Create get_args function as follows: + + def get_args_0(): + nonlocal x, y + return x, y + """ + + def empty_node(): + func_def = """ + def {func_name}(): + return + """.format(func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX)) + return gast.parse(textwrap.dedent(func_def)).body[0] + + assert isinstance(names, (list, tuple)) + if not names: + return empty_node() + + mapped = list(filter(lambda n: '.' not in n, names)) + nonlocal_names = sorted( + mapped, + key=mapped.index) # to keep the order, we can't use set() to unique + template = """ + def {func_name}(): + nonlocal {nonlocal_vars} + return {vars} + """ + func_def = template.format( + func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX), + nonlocal_vars=','.join(nonlocal_names), + vars=",".join(names)) + return gast.parse(textwrap.dedent(func_def)).body[0] + + +GET_ARGS_FUNC_PREFIX = 'get_args' +SET_ARGS_FUNC_PREFIX = 'set_args' +ARGS_NAME = '__args' + + +def create_set_args_node(names): + """ + Create set_args function as follows: + + def set_args_0(__args): + nonlocal x, y + x, y = __args + """ + + def empty_node(): + func_def = """ + def {func_name}({args}): + pass + """.format(func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), + args=ARGS_NAME) + return gast.parse(textwrap.dedent(func_def)).body[0] + + assert isinstance(names, (list, tuple)) + if not names: + return empty_node() + + mapped = list(filter(lambda n: '.' not in n, names)) + nonlocal_names = sorted( + mapped, + key=mapped.index) # to keep the order, we can't use set() to unique + template = """ + def {func_name}({args}): + nonlocal {nonlocal_vars} + {vars} = {args} + """ + func_def = template.format( + func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), + args=ARGS_NAME, + nonlocal_vars=','.join(nonlocal_names), + vars=",".join(names)) + return gast.parse(textwrap.dedent(func_def)).body[0] + + +def create_nonlocal_stmt_node(names): + assert isinstance(names, (list, tuple)) + + mapped = list(filter(lambda n: '.' 
not in n, names)) + names = sorted( + mapped, + key=mapped.index) # to keep the order, we can't use set() to unique + func_code = "nonlocal {}".format(','.join(names)) + return gast.parse(func_code).body[0] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index 92ef7a3f13d9b..9bbce59fc54ce 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -16,15 +16,17 @@ import six import paddle +import textwrap from paddle.utils import gast -from paddle.fluid import core from paddle.fluid import unique_name from paddle.fluid.framework import Variable -from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, data_layer_not_check __all__ = [ - 'create_bool_as_type', 'create_fill_constant_node', 'to_static_variable', - 'create_undefined_var' + 'create_bool_as_type', + 'create_fill_constant_node', + 'to_static_variable', + 'create_undefined_var', ] @@ -33,12 +35,6 @@ def create_undefined_var(name): return gast.parse(func_code).body[0] -def create_nonlocal_stmt_node(names): - assert isinstance(names, (list, tuple)) - func_code = "nonlocal {}".format(','.join(names)) - return gast.parse(func_code).body[0] - - def create_fill_constant_node(name, value=0): func_code = "{} = paddle.full(shape=[1], ".format(name) if isinstance(value, bool): @@ -66,7 +62,9 @@ def to_static_variable(x): return paddle.full(shape=[1], dtype='float64', fill_value=x) if isinstance(x, six.integer_types): return paddle.full(shape=[1], dtype='int64', fill_value=x) - + if isinstance(x, UndefinedVar): + return data_layer_not_check(unique_name.generator("loop_undefined_var"), + [-1]) return x diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py index f573960b5dba0..1d64e7b81849f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py @@ -177,7 +177,6 @@ def test_list_pop_in_for_loop(x, iter_num): one = fluid.layers.ones(shape=[1], dtype="int32") for i in range(one.numpy()[0]): item = a.pop() - return a[0], item, b[1] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py index 78d97a3884aed..683135b9078dc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py @@ -270,7 +270,7 @@ def test_nested_loop_vars(self): self.loop_var_names = [ set(["j", "two"]), set(["i", "three", "b"]), - set(["i", "j"]) + set(["i"]) ] self.create_var_names = [set(), set(["b"]), set()] From 99a4ff8fe4be92c982177b735b176aa8f55fae71 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 30 Jun 2022 15:10:09 +0800 Subject: [PATCH 005/250] [new-exec] support runing with different scope and the same program using scope_guard (#43962) * support scope_guard * fix test --- .../new_executor/standalone_executor.cc | 19 +++++++++----- .../new_executor/standalone_executor.h | 6 ++++- paddle/fluid/pybind/pybind.cc | 12 ++++++--- python/paddle/fluid/executor.py | 8 +++--- .../interpreter/test_standalone_executor.py | 26 ++++++++++++------- 5 files changed, 47 insertions(+), 24 deletions(-) diff --git 
a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 3ef0a827c2480..31b1627dc650a 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -28,44 +28,50 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, scope_(scope) { // NOTE(zhiqiu): for startup_program, run once ? if (startup_prog.Block(0).AllOps().size() > 0) { - auto core = GetInterpreterCore(startup_prog, {}, {}, false); + auto core = GetInterpreterCore(scope, startup_prog, {}, {}, false); VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; core->Run({}); } } paddle::framework::FetchList StandaloneExecutor::Run( + Scope* scope, const std::vector& feed_names, const std::vector& feed_tensors, const std::vector& fetch_names) { platform::RecordEvent record_event( "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); - auto core = GetInterpreterCore(main_prog_, feed_names, fetch_names, true); + auto core = + GetInterpreterCore(scope, main_prog_, feed_names, fetch_names, true); return core->Run(feed_names, feed_tensors); } paddle::framework::FetchList StandaloneExecutor::Run( + Scope* scope, const std::vector& feed_names, const std::vector& fetch_names) { platform::RecordEvent record_event( "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); - auto core = GetInterpreterCore(main_prog_, feed_names, fetch_names, false); + auto core = + GetInterpreterCore(scope, main_prog_, feed_names, fetch_names, false); VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; return core->Run(feed_names); } framework::interpreter::CostInfo StandaloneExecutor::DryRun( + Scope* scope, const std::vector& feed_names, const std::vector& feed_tensors) { - auto core = GetInterpreterCore(main_prog_, feed_names, {}, true); + auto core = GetInterpreterCore(scope, main_prog_, feed_names, {}, true); return core->DryRun(feed_names, feed_tensors); } std::shared_ptr StandaloneExecutor::GetInterpreterCore( + Scope* scope, const ProgramDesc& prog, const std::vector& feed_names, const std::vector& fetch_names, @@ -79,6 +85,7 @@ std::shared_ptr StandaloneExecutor::GetInterpreterCore( for (auto& fetchname : fetch_names) { oss << fetchname << ","; } + oss << "scope:" << scope; auto iter = interpretercores_.find(oss.str()); @@ -89,13 +96,13 @@ std::shared_ptr StandaloneExecutor::GetInterpreterCore( std::shared_ptr core = nullptr; if (add_fetch_op) { - core = CreateInterpreterCore(place_, prog, scope_, fetch_names); + core = CreateInterpreterCore(place_, prog, scope, fetch_names); } else { core = std::make_shared( place_, prog.Block(0), /*skip_gc_vars=*/std::set(), - scope_); + scope); } interpretercores_.emplace(oss.str(), core); return core; diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 7b54a855007be..5b9c48009ea83 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -39,6 +39,7 @@ class StandaloneExecutor { ~StandaloneExecutor() {} paddle::framework::FetchList Run( + Scope* scope, const std::vector& feed_names, const std::vector& feed_tensors, const std::vector& fetch_names); @@ -46,15 +47,18 @@ class StandaloneExecutor { // NOTE(zhiqiu): feed_names are only used for caching interpretercore. 
// fetch_names are used for caching interpretercore and inserting fetch ops, // the latter can be moved to python side. - paddle::framework::FetchList Run(const std::vector& feed_names, + paddle::framework::FetchList Run(Scope* scope, + const std::vector& feed_names, const std::vector& fetch_names); framework::interpreter::CostInfo DryRun( + Scope* scope, const std::vector& feed_names, const std::vector& feed_tensors); private: std::shared_ptr GetInterpreterCore( + Scope* scope, const ProgramDesc& prog, const std::vector& feed_names, const std::vector& fetch_names, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 18a3fb1aab86b..7b7e9d1a6c9ed 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3063,6 +3063,7 @@ All parameter, weight, gradient are variables in Paddle. Scope *>()) .def("run", [](StandaloneExecutor &self, + Scope *scope, const std::unordered_map &input_dict, std::vector fetch_names) { std::vector feed_tensors; @@ -3079,12 +3080,13 @@ All parameter, weight, gradient are variables in Paddle. paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; - ret = self.Run(feed_names, feed_tensors, fetch_names); + ret = self.Run(scope, feed_names, feed_tensors, fetch_names); } return py::cast(std::move(ret)); }) .def("run", [](StandaloneExecutor &self, + Scope *scope, const std::unordered_map &input_dict, std::vector fetch_names) { @@ -3099,23 +3101,25 @@ All parameter, weight, gradient are variables in Paddle. paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; - ret = self.Run(feed_names, feed_tensors, fetch_names); + ret = self.Run(scope, feed_names, feed_tensors, fetch_names); } return py::cast(std::move(ret)); }) .def("run", [](StandaloneExecutor &self, + Scope *scope, std::vector feed_names, std::vector fetch_names) { paddle::framework::FetchList ret; { pybind11::gil_scoped_release release; - ret = self.Run(feed_names, fetch_names); + ret = self.Run(scope, feed_names, fetch_names); } return py::cast(std::move(ret)); }) .def("dry_run", [](StandaloneExecutor &self, + Scope *scope, const std::unordered_map &input_dict) { std::vector feed_tensors; std::vector feed_names; @@ -3131,7 +3135,7 @@ All parameter, weight, gradient are variables in Paddle. framework::interpreter::CostInfo cost_info; { pybind11::gil_scoped_release release; - cost_info = self.DryRun(feed_names, feed_tensors); + cost_info = self.DryRun(scope, feed_names, feed_tensors); } return cost_info; }); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 62578eef86cfc..d932b3f219bc2 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -537,7 +537,7 @@ def __init__(self, place, main_program, scope): self._scope = scope self._new_exe = self._create_new_executor() - def run(self, feed_names, fetch_list, return_numpy=True): + def run(self, scope, feed_names, fetch_list, return_numpy=True): """ Args: feed_names(list): This parameter represents the input names of the model. 
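The practical effect of threading the scope through is easiest to see at the call site. Below is a minimal usage sketch of the new convention; the program construction is illustrative (it only mirrors the unit-test changes further below) and is not part of the patch itself.

    import numpy as np
    import paddle
    from paddle.fluid import core

    paddle.enable_static()

    startup_program = paddle.static.Program()
    main_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        a = paddle.static.data(name="a", shape=[2, 2], dtype="float32")
        c = a * 2.0

    place = core.Place()
    place.set_place(paddle.CPUPlace())
    scope = core.Scope()
    exe = core.StandaloneExecutor(place, startup_program.desc, main_program.desc, scope)

    # The scope is now an explicit argument of every run call, so one executor
    # can be reused across different scopes with the same program.
    out = exe.run(scope, {"a": np.ones([2, 2], dtype="float32")}, [c.name])

Because GetInterpreterCore now appends "scope:" to the cache key (see the C++ change above), each distinct scope gets its own cached InterpreterCore instead of silently reusing state from another scope.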
@@ -549,7 +549,8 @@ def run(self, feed_names, fetch_list, return_numpy=True): """ fetch_list = self._check_fetch(fetch_list) - tensors = self._new_exe.run(feed_names, fetch_list)._move_to_list() + tensors = self._new_exe.run(scope, feed_names, + fetch_list)._move_to_list() if return_numpy: return as_numpy(tensors, copy=True) else: @@ -1470,7 +1471,8 @@ def _can_use_interpreter_core(program, place): cpu_tensor = _as_lodtensor(data, core.CPUPlace()) tensor._copy_from(cpu_tensor, self.place) - return new_exe.run(list(feed.keys()), fetch_list, return_numpy) + return new_exe.run(scope, list(feed.keys()), fetch_list, + return_numpy) compiled = isinstance(program, compiler.CompiledProgram) diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 9e375126550cc..6fa419ae28228 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -50,27 +50,30 @@ def build_program(self): def test_interp_base(self): startup_program, main_program, c = self.build_program() + scope = core.Scope() standaloneexecutor = StandaloneExecutor(self.place, startup_program.desc, - main_program.desc, core.Scope()) + main_program.desc, scope) out = standaloneexecutor.run( - {"a": np.ones([2, 2], dtype="float32") * 2}, [c.name]) + scope, {"a": np.ones([2, 2], dtype="float32") * 2}, [c.name]) for i in range(10): out = standaloneexecutor.run( - {"a": np.ones([2, 2], dtype="float32") * i}, [c.name]) + scope, {"a": np.ones([2, 2], dtype="float32") * i}, [c.name]) for i in range(10): out = standaloneexecutor.run( - {"a": np.ones([2, 2], dtype="float32") * i}, ['a', c.name]) + scope, {"a": np.ones([2, 2], dtype="float32") * i}, + ['a', c.name]) def test_dry_run(self): + scope = core.Scope() startup_program, main_program, c = self.build_program() standaloneexecutor = StandaloneExecutor(self.place, startup_program.desc, - main_program.desc, core.Scope()) + main_program.desc, scope) # test for cost_info cost_info = standaloneexecutor.dry_run( - {"a": np.ones([2, 2], dtype="float32")}) + scope, {"a": np.ones([2, 2], dtype="float32")}) self.check_cost_info(cost_info) def check_cost_info(self, cost_info): @@ -132,14 +135,15 @@ def test_standalone_executor_statistics(self): p = core.Place() p.set_place(self.place) + scope = core.Scope() executor = StandaloneExecutor(p, startup_program.desc, - main_program.desc, core.Scope()) + main_program.desc, scope) helper_profiler = profiler.Profiler( targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) helper_profiler.start() for i in range(self.iter_n): - executor.run({}, fetch_list) + executor.run(scope, {}, fetch_list) helper_profiler.step() helper_profiler.stop() @@ -251,13 +255,15 @@ def run_new_executor(self): p = core.Place() p.set_place(self.place) + scope = core.Scope() inter_core = StandaloneExecutor(p, startup_program.desc, - main_program.desc, core.Scope()) + main_program.desc, scope) outs = [] for i in range(self.iter_n): outs.append( - np.array(inter_core.run({}, fetch_list)._move_to_list()[0])) + np.array( + inter_core.run(scope, {}, fetch_list)._move_to_list()[0])) return outs From 4ac9d64f2c562e90c74bc956cf34368320a69dd6 Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Thu, 30 Jun 2022 15:39:39 +0800 Subject: [PATCH 006/250] fix launch exit graceful (#43940) --- .../distributed/launch/context/__init__.py | 4 ++++ .../launch/controllers/controller.py | 18 
++++++++------- .../distributed/launch/controllers/master.py | 2 +- .../distributed/launch/controllers/watcher.py | 4 +++- .../distributed/launch/job/container.py | 6 ++++- python/paddle/distributed/launch/job/pod.py | 22 ++++++++++++++----- 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py index 902c8189b1720..3e8f0de3e69d5 100644 --- a/python/paddle/distributed/launch/context/__init__.py +++ b/python/paddle/distributed/launch/context/__init__.py @@ -76,6 +76,10 @@ def is_legacy_mode(self): def get_envs(self): return self.envs.copy() + def set_envs(self, env={}): + env = {k: v for k, v in env.items() if isinstance(v, str)} + self.envs.update(env) + def _enable_plugin(self): for pl in plugins.enabled_plugins: pl(self) diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 1f43679d748f1..bc628be59dc22 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -49,6 +49,8 @@ def __init__(self, ctx): jid=self.ctx.args.job_id) self.pod = Pod() + self.ctx.set_envs({"POD_NAME": self.pod.name}) + self.join_server = None def deploy_pod(self): @@ -104,17 +106,18 @@ def watch(self) -> bool: self.ctx.logger.info("Pod {}".format(status)) self.ctx.logger.error("Container failed !!!\n{}".format(fc[0])) fc[0].tail() - self.pod.stop() if self.ctx.args.elastic_level <= 0: + self.pod.stop(timeout=3) return True else: + self.pod.stop(timeout=30) return False # peer failure if self.ctx.status.is_restarting( ) and self.master.get_status() != self.ctx.status.COMPLETED: - self.pod.stop() + self.pod.stop(timeout=30) return False def stop(self, sigint=None): @@ -123,7 +126,7 @@ def stop(self, sigint=None): self.watcher.stop() self.master.stop() - self.pod.stop(sigint) + self.pod.stop(timeout=30) def finalize(self): self.pod.join() @@ -133,17 +136,16 @@ def finalize(self): sys.exit(self.pod.exit_code) def signal_handler(self, sigint, frame): - self.ctx.logger.info("Terminating with signal {}".format(sigint)) - if hasattr(self, 'sigint'): self.ctx.logger.info("Force quit in 10 seconds...") - time.sleep(11) + self.pod.stop(timeout=10) sys.exit(sigint) + self.ctx.logger.info("Terminating with signal {}".format(sigint)) + self.sigint = sigint self.ctx.status.done() - self.stop(sigint) - time.sleep(1) + self.stop(sigint=sigint) self.ctx.logger.info("Exit with signal {}".format(sigint)) sys.exit(sigint) diff --git a/python/paddle/distributed/launch/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py index 8e8d31f86dd9f..825be9c36888c 100644 --- a/python/paddle/distributed/launch/controllers/master.py +++ b/python/paddle/distributed/launch/controllers/master.py @@ -316,5 +316,5 @@ def get_status(self): def stop(self): if hasattr(self, 'beat_thread'): self.ctx.status.done() - # TODO(kuizhiqing) thread should exit + # daemon thread #self.beat_thread.join() diff --git a/python/paddle/distributed/launch/controllers/watcher.py b/python/paddle/distributed/launch/controllers/watcher.py index 6e8a2cc4e8781..4b8e346e7908f 100644 --- a/python/paddle/distributed/launch/controllers/watcher.py +++ b/python/paddle/distributed/launch/controllers/watcher.py @@ -93,4 +93,6 @@ def _save_gpu_log(self, util_key): def stop(self): if hasattr(self, "proc"): - self.proc.join() + # daemon without join + # self.proc.join() + pass diff 
--git a/python/paddle/distributed/launch/job/container.py b/python/paddle/distributed/launch/job/container.py index 8f515d9e6f38b..e0f580da0ac45 100644 --- a/python/paddle/distributed/launch/job/container.py +++ b/python/paddle/distributed/launch/job/container.py @@ -131,7 +131,11 @@ def terminate(self, force=False): return self._proc.terminate(force) def wait(self, timeout=None): - self._proc.wait(timeout) + try: + self._proc.wait(timeout) + return True + except Exception: + return False @property def exit_code(self): diff --git a/python/paddle/distributed/launch/job/pod.py b/python/paddle/distributed/launch/job/pod.py index cda400f0a324a..c99b2db547a26 100644 --- a/python/paddle/distributed/launch/job/pod.py +++ b/python/paddle/distributed/launch/job/pod.py @@ -116,14 +116,26 @@ def deploy(self): self._restart += 1 - def stop(self, sigint=0): + def stop(self, sigint=15, timeout=None): for c in self._containers: - force = True if sigint == 9 else False - c.terminate(force) + if isinstance(sigint, int) and timeout is None: + c.send_signal(sigint) + else: + c.terminate() + + if isinstance(timeout, int): + if not self.join(timeout): + for c in self._containers: + c.terminate(force=True) + return False + else: + return True - def join(self): + def join(self, timeout=None): for c in self._containers: - c.wait(None) + if not c.wait(timeout): + return False + return True @property def status(self): From 35ca30090b872a57b19f192def126505d2b3a574 Mon Sep 17 00:00:00 2001 From: zmxdream Date: Thu, 30 Jun 2022 15:52:55 +0800 Subject: [PATCH 007/250] Revert "[GPUPS]Optimize dymf kernel (#43911)" (#43958) * Revert "[GPUPS]Optimize dymf kernel (#43911)" --- .../fleet/heter_ps/hashtable_kernel.cu | 50 ++--- .../framework/fleet/heter_ps/heter_comm_inl.h | 1 - .../fleet/heter_ps/heter_comm_kernel.cu | 174 ++---------------- .../fleet/heter_ps/heter_comm_kernel.h | 33 +--- 4 files changed, 41 insertions(+), 217 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index a7e00bb083f40..92df8d8581a86 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -89,42 +89,30 @@ __global__ void dy_mf_search_kernel(Table* table, char* vals, size_t len, size_t pull_feature_value_size) { - const size_t i = blockIdx.x * blockDim.y + threadIdx.y; - const size_t k = threadIdx.x; + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < len) { auto it = table->find(keys[i]); + if (it != table->end()) { uint64_t offset = i * pull_feature_value_size; FeatureValue* cur = (FeatureValue*)(vals + offset); FeatureValue& input = *(FeatureValue*)(it->second); - char* cur_p = (char*)cur; - char* input_p = (char*)(&input); - int len = 9 + input.mf_dim + 1; - if (k == 3 || k == 6 || k == 7) - *(int*)(cur_p + k * 4) = *(int*)(input_p + k * 4); - else if (k < 8) - *(float*)(cur_p + k * 4) = *(float*)(input_p + k * 4); - else if (k == 8) { - *(uint64_t*)(cur_p + k * 4) = *(uint64_t*)(input_p + k * 4); - } else { - int len_per_thread = (len - 9) / (blockDim.y - 9); - int remain = (len - 9) % (blockDim.y - 9); - int real_len = len_per_thread; - if ((k - 9) < remain) real_len++; - int left = -1, right = -1; - if ((k - 9) < remain) { - left = 9 + (k - 9) * (len_per_thread + 1); - right = left + real_len; - } else { - left = 9 + remain * (len_per_thread + 1) + - (k - 9 - remain) * len_per_thread; - right = left + real_len; - } - for (int j = left; j < right; j++) - 
*(float*)(cur_p + (j + 1) * 4) = *(float*)(input_p + (j + 1) * 4); + cur->slot = input.slot; + cur->show = input.show; + cur->clk = input.clk; + cur->mf_dim = input.mf_dim; + cur->lr = input.lr; + cur->mf_size = input.mf_size; + cur->cpu_ptr = input.cpu_ptr; + cur->delta_score = input.delta_score; + cur->lr_g2sum = input.lr_g2sum; + for (int j = 0; j < cur->mf_dim + 1; ++j) { + cur->mf[j] = input.mf[j]; } } else { - if (keys[i] != 0) printf("pull miss key: %llu", keys[i]); + if (keys[i] != 0) { + printf("warning::pull miss key: %llu", keys[i]); + } } } } @@ -231,10 +219,8 @@ void HashTable::get(const KeyType* d_keys, if (len == 0) { return; } - dim3 block_dims(32, 32); - const int grid_size = (len - 1) / 32 + 1; - dim3 grid_dims(grid_size); - dy_mf_search_kernel<<>>( + const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; + dy_mf_search_kernel<<>>( container_, d_keys, d_vals, len, pull_feature_value_size_); } diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 8952039299d06..ace533cb0c745 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -760,7 +760,6 @@ void HeterComm::dynamic_merge_grad( (char*)d_grads, (char*)d_merge_grads_ptr, uniq_len, - max_mf_dim_, grad_value_size, merger_, stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu index 8a13d9abe635d..fd0dd1a72cca1 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.cu @@ -144,106 +144,28 @@ __global__ void dy_mf_fill_shard_grads_kernel(KeyType* d_shard_keys, } } -// optimized version -template <> -__global__ void -dy_mf_fill_shard_grads_kernel( - FeatureKey* d_shard_keys, - FeatureKey* d_keys, - FeaturePushValue* d_shard_grads, - FeaturePushValue* d_grads, - int* idx, - size_t len, - size_t grad_value_size) { - const size_t i = blockIdx.x * blockDim.y + threadIdx.y; - const size_t k = threadIdx.x; - if (i < len) { - if (k == 0) { - d_shard_keys[i] = d_keys[idx[i]]; - } - FeaturePushValue* cur = - (FeaturePushValue*)((char*)d_shard_grads + i * grad_value_size); - FeaturePushValue& input = *( - FeaturePushValue*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); - char* cur_p = (char*)cur; - char* input_p = (char*)(&input); - int len = 5 + input.mf_dim; - if (k == 2 || k == 4) - *(int*)(cur_p + k * 4) = *(int*)(input_p + k * 4); - else if (k < 5) - *(float*)(cur_p + k * 4) = *(float*)(input_p + k * 4); - else { - int len_per_thread = (len - 5) / (blockDim.y - 5); - int remain = (len - 5) % (blockDim.y - 5); - int real_len = len_per_thread; - if ((k - 5) < remain) real_len++; - int left = -1, right = -1; - if ((k - 5) < remain) { - left = 5 + (k - 5) * (len_per_thread + 1); - right = left + real_len; - } else { - left = 5 + remain * (len_per_thread + 1) + - (k - 5 - remain) * len_per_thread; - right = left + real_len; - } - for (int j = left; j < right; j++) - *(float*)(cur_p + j * 4) = *(float*)(input_p + j * 4); - } - } -} - -__global__ void merge_gradients_basic_kernel(const uint32_t* offset, - const uint32_t* fea_num, - const uint32_t* index, - const char* input, - char* output, - int n, - size_t grad_value_size, - DynamicGradMerger& merger) { +__global__ void merge_gradients_kernel(const uint32_t* offset, + const uint32_t* fea_num, + const uint32_t* index, + const char* input, + char* output, + int n, + 
size_t grad_value_size, + DynamicGradMerger& merger_) { const size_t i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) { uint32_t start = offset[i]; uint32_t num = fea_num[i]; int ori_index = index[start]; - FeaturePushValue& lhs = *(FeaturePushValue*)(output + i * grad_value_size); + FeaturePushValue& out = *(FeaturePushValue*)(output + i * grad_value_size); FeaturePushValue& in = *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger.update_basic(lhs, in); + merger_.update_one(out, in); for (int j = 1; j < num; ++j) { ori_index = index[start + j]; FeaturePushValue& rhs = *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger.merge_basic(lhs, rhs); - } - } -} - -__global__ void merge_gradients_embedx_kernel(const uint32_t* offset, - const uint32_t* fea_num, - const uint32_t* index, - const char* input, - char* output, - int n, - size_t grad_dim, - size_t grad_value_size, - DynamicGradMerger& merger) { - const size_t i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < n) { - size_t value_idx = i / grad_dim; - size_t field_idx = i % grad_dim; - uint32_t start = offset[value_idx]; - uint32_t num = fea_num[value_idx]; - int ori_index = index[start]; - FeaturePushValue& in = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - FeaturePushValue& lhs = - *(FeaturePushValue*)(output + value_idx * grad_value_size); - merger.update_embedx(lhs, in, field_idx); - for (int j = 1; j < num; ++j) { - int ori_index = index[start + j]; - FeaturePushValue& rhs = - *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); - merger.merge_embedx(lhs, rhs, field_idx); + merger_.merge_one(out, rhs); } } } @@ -262,49 +184,6 @@ __global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals, } } -// optimized version -template <> -__global__ void dy_mf_fill_dvals_kernel( - FeatureValue* d_shard_vals, - FeatureValue* d_vals, - int* idx, - size_t len, - size_t val_size) { - const size_t i = blockIdx.x * blockDim.y + threadIdx.y; - const size_t k = threadIdx.x; - if (i < len) { - uint64_t new_offset = uint64_t(idx[i]) * val_size; - FeatureValue* cur = (FeatureValue*)((char*)d_vals + new_offset); - FeatureValue& input = *(FeatureValue*)((char*)d_shard_vals + i * val_size); - char* cur_p = (char*)cur; - char* input_p = (char*)(&input); - int len = 9 + input.mf_dim + 1; - if (k == 3 || k == 6 || k == 7) - *(int*)(cur_p + k * 4) = *(int*)(input_p + k * 4); - else if (k < 8) - *(float*)(cur_p + k * 4) = *(float*)(input_p + k * 4); - else if (k == 8) { - *(uint64_t*)(cur_p + k * 4) = *(uint64_t*)(input_p + k * 4); - } else { - int len_per_thread = (len - 9) / (blockDim.x - 9); - int remain = (len - 9) % (blockDim.y - 9); - int real_len = len_per_thread; - if ((k - 9) < remain) real_len++; - int left = -1, right = -1; - if ((k - 9) < remain) { - left = 9 + (k - 9) * (len_per_thread + 1); - right = left + real_len; - } else { - left = 9 + remain * (len_per_thread + 1) + - (k - 9 - remain) * len_per_thread; - right = left + real_len; - } - for (int j = left; j < right; j++) - *(float*)(cur_p + (j + 1) * 4) = *(float*)(input_p + (j + 1) * 4); - } - } -} - // cuda implemention of heter_comm_kernel.h template void HeterCommKernel::fill_idx(T* idx, @@ -442,12 +321,9 @@ void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, long long len, size_t grad_value_size, const StreamType& stream) { - // int grid_size = (len - 1) / block_size_ + 1; + int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; - dim3 block_dims(32, 32); - const 
size_t grid_size = (len - 1) / 32 + 1; - dim3 grid_dims(grid_size); - dy_mf_fill_shard_grads_kernel<<>>( + dy_mf_fill_shard_grads_kernel<<>>( d_shard_keys, d_keys, d_shard_grads, @@ -464,26 +340,12 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset, const char* input, char* output, int n, - size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, const StreamType& stream) { int grid_size = (n - 1) / block_size_ + 1; - merge_gradients_basic_kernel<<>>( + merge_gradients_kernel<<>>( offset, fea_num, index, input, output, n, grad_value_size, merger_); - if (grad_dim > 0) { - int grid_size2 = (n * grad_dim - 1) / block_size_ + 1; - merge_gradients_embedx_kernel<<>>( - offset, - fea_num, - index, - input, - output, - n * grad_dim, - grad_dim, - grad_value_size, - merger_); - } } template @@ -493,12 +355,9 @@ void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals, long long len, size_t val_size, const StreamType& stream) { - // int grid_size = (len - 1) / block_size_ + 1; + int grid_size = (len - 1) / block_size_ + 1; size_t c_len = (size_t)len; - dim3 block_dims(32, 32); - const size_t grid_size_ = (len - 1) / 32 + 1; - dim3 grid_dims(grid_size_); - dy_mf_fill_dvals_kernel<<>>( + dy_mf_fill_dvals_kernel<<>>( d_shard_vals, d_vals, idx, c_len, val_size); } @@ -628,7 +487,6 @@ template void HeterCommKernel::merge_gradient( const char* input, char* output, int n, - size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, const cudaStream_t& stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h index 6859161a5fe48..d1555dc2e0919 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_kernel.h @@ -42,41 +42,23 @@ struct DynamicGradMerger { } template - __device__ __forceinline__ void update_basic(T& output, const T& input) { + __device__ __forceinline__ void update_one(T& output, const T& input) { output.slot = input.slot; output.show = input.show; output.clk = input.clk; output.mf_dim = input.mf_dim; output.lr_g = input.lr_g; - // for (int i = 0; i < output.mf_dim; ++i) { - // output.mf_g[i] = input.mf_g[i]; - //} + for (int i = 0; i < output.mf_dim; ++i) { + output.mf_g[i] = input.mf_g[i]; + } } template - __device__ __forceinline__ void merge_basic(T& output, const T& input) { + __device__ __forceinline__ void merge_one(T& output, const T& input) { output.show += input.show; output.clk += input.clk; output.lr_g += input.lr_g; - // for (int i = 0; i < input.mf_dim; ++i) { - // output.mf_g[i] += input.mf_g[i]; - //} - } - - template - __device__ __forceinline__ void update_embedx(T& output, - const T& input, - size_t embedx_id) { - if (embedx_id < output.mf_dim) { - output.mf_g[embedx_id] = input.mf_g[embedx_id]; - } - } - - template - __device__ __forceinline__ void merge_embedx(T& output, - const T& input, - size_t embedx_id) { - if (embedx_id < output.mf_dim) { - output.mf_g[embedx_id] += input.mf_g[embedx_id]; + for (int i = 0; i < input.mf_dim; ++i) { + output.mf_g[i] += input.mf_g[i]; } } }; @@ -183,7 +165,6 @@ class HeterCommKernel { const char* input, char* output, int n, - size_t grad_dim, size_t grad_value_size, DynamicGradMerger& merger_, const StreamType& stream); From 52d43ca2905c2df7ac9e1a99b06eec6f5835d3e4 Mon Sep 17 00:00:00 2001 From: chenjian Date: Thu, 30 Jun 2022 16:42:22 +0800 Subject: [PATCH 008/250] Add statistic code for memory (#43960) * add code * add unit test --- 
paddle/fluid/platform/profiler.cc | 305 +++++++++++++++--- paddle/fluid/platform/profiler/mem_tracing.h | 12 + .../unittests/test_profiler_statistic.py | 39 +++ python/paddle/profiler/profiler_statistic.py | 153 ++++++++- 4 files changed, 472 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index ec33e9e819869..38471251ff4a1 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -308,6 +308,10 @@ RecordOpInfoSupplement::RecordOpInfoSupplement( PosixInNsec(), type, input_shapes, dtypes, callstack); } +std::map>> + RecordMemEvent::size_cache; +std::map> + RecordMemEvent::has_initialized; RecordMemEvent::RecordMemEvent(const void *ptr, const phi::Place &place, size_t size, @@ -323,17 +327,75 @@ RecordMemEvent::RecordMemEvent(const void *ptr, uint64_t peak_reserved = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - current_allocated = - HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); - peak_allocated = - HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true; + } else { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] = + current_allocated; + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] = + peak_allocated; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + } + } else { - current_allocated = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); - peak_allocated = - DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + 
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] = + current_allocated; + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] = + peak_allocated; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + } } - platform::MemEvenRecorder::Instance().PushMemRecord(ptr, place, size, @@ -349,17 +411,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr, uint64_t peak_allocated = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - current_reserved = - HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true; + } else { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] = + current_reserved; + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] = + peak_reserved; + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + } } else { - current_reserved = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + 
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] = + current_reserved; + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] = + peak_reserved; + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + } } - platform::MemEvenRecorder::Instance().PushMemRecord(ptr, place, size, @@ -375,17 +494,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr, uint64_t peak_reserved = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - current_allocated = - HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); - peak_allocated = - HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true; + } else { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0] = + current_allocated; + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2] = + peak_allocated; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + } } else { - current_allocated = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); - peak_allocated = - DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + 
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0] = + current_allocated; + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2] = + peak_allocated; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + } } - platform::MemEvenRecorder::Instance().PopMemRecord(ptr, place, size, @@ -401,17 +577,74 @@ RecordMemEvent::RecordMemEvent(const void *ptr, uint64_t peak_allocated = 0; // 0 means keep the same as before if (platform::is_cpu_place(place) || platform::is_cuda_pinned_place(place)) { - current_reserved = - HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()].push_back( + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["cpu"][place.GetDeviceId()] = true; + } else { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][1] = + current_reserved; + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][3] = + peak_reserved; + current_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][0]; + peak_allocated = + RecordMemEvent::size_cache["cpu"][place.GetDeviceId()][2]; + } } else { - current_reserved = - DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); - peak_reserved = - DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + if (RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] == + false) { + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId())); + 
RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId())); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()].push_back( + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId())); + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + current_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + peak_reserved = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3]; + RecordMemEvent::has_initialized["gpu"][place.GetDeviceId()] = true; + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][1] = + current_reserved; + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][3] = + peak_reserved; + current_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][0]; + peak_allocated = + RecordMemEvent::size_cache["gpu"][place.GetDeviceId()][2]; + } } - platform::MemEvenRecorder::Instance().PopMemRecord(ptr, place, size, diff --git a/paddle/fluid/platform/profiler/mem_tracing.h b/paddle/fluid/platform/profiler/mem_tracing.h index 3d3508c7bd570..5b2a2391c2e79 100644 --- a/paddle/fluid/platform/profiler/mem_tracing.h +++ b/paddle/fluid/platform/profiler/mem_tracing.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/platform/place.h" @@ -37,6 +38,17 @@ class RecordMemEvent { const Place& place, size_t size, const TracerMemEventType type = TracerMemEventType::Allocate); + + // size_cache: In the outer map, key is device type, 'cpu' or 'gpu', and in + // the inner map, key is device ip. + // Values record memory sizes for current_allocated, current_reserved, + // peak_allocated and peak_reserved. + // has_initialized: Flags to denote whether memory cache for some device has + // collected once. 
+ + static std::map>> + size_cache; + static std::map> has_initialized; }; } // namespace platform diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py index e5463b1a90d59..6481e0f825df1 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -16,6 +16,7 @@ import paddle import paddle.profiler as profiler +import paddle.profiler.profiler_statistic as profiler_statistic class HostPythonNode: @@ -30,6 +31,7 @@ def __init__(self, name, type, start_ns, end_ns, process_id, thread_id): self.children_node = [] self.runtime_node = [] self.device_node = [] + self.mem_node = [] class DevicePythonNode: @@ -45,6 +47,22 @@ def __init__(self, name, type, start_ns, end_ns, device_id, context_id, self.stream_id = stream_id +class MemPythonNode: + def __init__(self, timestamp_ns, addr, type, process_id, thread_id, increase_bytes, place, current_allocated, \ + current_reserved, peak_allocated, peak_reserved): + self.timestamp_ns = timestamp_ns + self.addr = addr + self.type = type + self.process_id = process_id + self.thread_id = thread_id + self.increase_bytes = increase_bytes + self.place = place + self.current_allocated = current_allocated + self.current_reserved = current_reserved + self.peak_allocated = peak_allocated + self.peak_reserved = peak_reserved + + class TestProfilerStatistic(unittest.TestCase): def test_statistic_case1(self): @@ -89,6 +107,9 @@ def test_statistic_case1(self): conv2d_compute = HostPythonNode('conv2d::compute', profiler.TracerEventType.OperatorInner, 30, 40, 1000, 1001) + conv2d_compute.mem_node.append( + MemPythonNode(33, 0, profiler_statistic.TracerMemEventType.Allocate, + 1000, 1001, 20, 'place(gpu:0)', 200, 200, 800, 800)) conv2d_launchkernel = HostPythonNode( 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 30, 35, 1000, 1001) @@ -211,6 +232,24 @@ def test_statistic_case1(self): self.assertEqual( event_summary.memory_manipulation_items['AsyncMemcpy']. general_gpu_time, 60) + self.assertEqual( + statistic_data.memory_summary.allocated_items['place(gpu:0)'] + ['conv2d'].allocation_count, 1) + self.assertEqual( + statistic_data.memory_summary.allocated_items['place(gpu:0)'] + ['conv2d'].allocation_size, 20) + self.assertEqual( + statistic_data.memory_summary.allocated_items['place(gpu:0)'] + ['conv2d'].increase_size, 20) + self.assertEqual( + statistic_data.memory_summary.allocated_items['place(gpu:0)'] + ['conv2d'].increase_size, 20) + self.assertEqual( + statistic_data.memory_summary. + peak_allocation_values['place(gpu:0)'], 800) + self.assertEqual( + statistic_data.memory_summary.peak_reserved_values['place(gpu:0)'], + 800) print( profiler.profiler_statistic._build_table( statistic_data, diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index daa6925c4b907..f33335c907d7a 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -15,7 +15,7 @@ from enum import Enum import re -from paddle.fluid.core import TracerEventType +from paddle.fluid.core import TracerEventType, TracerMemEventType from .statistic_helper import * @@ -603,6 +603,83 @@ def add_kernel_item(self, root_node): self.kernel_items[name].add_item(device_node) +class MemorySummary: + r""" + Analyse memory events in profiling data. 
+ """ + + class MemoryItem: + + def __init__(self, event_name, place, memory_type='Allocated'): + self.event_name = event_name + self.place = place + self.allocation_count = 0 + self.free_count = 0 + self.allocation_size = 0 + self.free_size = 0 + self.increase_size = 0 + self.memory_type = memory_type + + def add_memory_record(self, size, allocation_type): + if allocation_type == TracerMemEventType.Allocate or allocation_type == TracerMemEventType.ReservedAllocate: + self.allocation_count += 1 + self.allocation_size += size + + elif allocation_type == TracerMemEventType.Free or allocation_type == TracerMemEventType.ReservedFree: + self.free_count += 1 + self.free_size -= size # size is sign(-) when free. + + else: + print("No corresponding type.") + self.increase_size = self.allocation_size - self.free_size + + def __init__(self): + self.allocated_items = collections.defaultdict( + dict) # for memory summary, device type: event + self.reserved_items = collections.defaultdict( + dict) # for memory summary, device type: event + self.peak_allocation_values = collections.defaultdict(int) + self.peak_reserved_values = collections.defaultdict(int) + + def _analyse_node_memory(self, event_name, node): + for memnode in node.mem_node: # self mem node + if memnode.type == TracerMemEventType.Allocate or memnode.type == TracerMemEventType.Free: + if event_name not in self.allocated_items[memnode.place]: + self.allocated_items[ + memnode.place][event_name] = MemorySummary.MemoryItem( + event_name, memnode.place, 'Allocated') + self.allocated_items[ + memnode.place][event_name].add_memory_record( + memnode.increase_bytes, memnode.type) + elif memnode.type == TracerMemEventType.ReservedAllocate or memnode.type == TracerMemEventType.ReservedFree: + if event_name not in self.reserved_items[memnode.place]: + self.reserved_items[ + memnode.place][event_name] = MemorySummary.MemoryItem( + event_name, memnode.place, 'Reserved') + self.reserved_items[ + memnode.place][event_name].add_memory_record( + memnode.increase_bytes, memnode.type) + self.peak_allocation_values[memnode.place] = max( + self.peak_allocation_values[memnode.place], + memnode.peak_allocated) + self.peak_reserved_values[memnode.place] = max( + self.peak_reserved_values[memnode.place], memnode.peak_reserved) + + def parse(self, nodetrees): + r""" + Analyse memory event in the nodetress. + """ + thread2hostnodes = traverse_tree(nodetrees) + for threadid, host_nodes in thread2hostnodes.items(): + for host_node in host_nodes[1:]: #skip root node + if host_node.type == TracerEventType.OperatorInner: + continue + if host_node.type == TracerEventType.Operator: + for child in host_node.children_node: + self._analyse_node_memory(host_node.name, child) + self._analyse_node_memory(host_node.name, host_node) + + class StatisticData: r""" Hold all analysed results. 
@@ -614,9 +691,11 @@ def __init__(self, node_trees, extra_info): self.time_range_summary = TimeRangeSummary() self.event_summary = EventSummary() self.distributed_summary = DistributedSummary() + self.memory_summary = MemorySummary() self.time_range_summary.parse(node_trees) self.event_summary.parse(node_trees) self.distributed_summary.parse(node_trees) + self.memory_summary.parse(node_trees) def _build_table(statistic_data, @@ -1498,4 +1577,76 @@ def format_ratio(ratio, indent=0): append('') append('') + ###### Print Memory Summary Report ###### + if statistic_data.memory_summary.allocated_items or statistic_data.memory_summary.reserved_items: + for device_type, memory_events in statistic_data.memory_summary.allocated_items.items( + ): + all_row_values = [] + sorted_items = sorted(memory_events.items(), + key=lambda x: x[1].increase_size, + reverse=True) + + for event_name, item in sorted_items: + row_values = [ + event_name, item.memory_type, item.allocation_count, + item.free_count, item.allocation_size, item.free_size, + item.increase_size + ] + all_row_values.append(row_values) + + sorted_reserved_items = sorted(statistic_data.memory_summary. + reserved_items[device_type].items(), + key=lambda x: x[1].increase_size, + reverse=True) + for event_name, item in sorted_reserved_items: + row_values = [ + event_name, item.memory_type, item.allocation_count, + item.free_count, item.allocation_size, item.free_size, + item.increase_size + ] + all_row_values.append(row_values) + + # Calculate the column width + headers = [ + 'Name', 'Type', 'Allocation Count', 'Free Count', + 'Allocation Size', 'Free Size', 'Increased Size' + ] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + name_column_width = 50 + number_column_width = 15 + add_column(name_column_width) + add_column(12) + add_column(number_column_width) + add_column(number_column_width) + add_column(number_column_width) + add_column(number_column_width) + add_column(number_column_width) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append( + add_title(line_length, + "Memory Summary - {}".format(device_type))) + append('Peak Allocated Memory: {}'.format( + statistic_data.memory_summary. 
+ peak_allocation_values[device_type])) + append('Peak Reserved Memory: {}'.format( + statistic_data.memory_summary.peak_reserved_values[device_type]) + ) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + for row_values in all_row_values: + if isinstance(row_values, str): + append(add_title(line_length, row_values)) + else: + append(row_format.format(*row_values)) + append('') + append('') + return ''.join(result) From 59d50468e6212973e35b3b340cd1a09143fbc8bb Mon Sep 17 00:00:00 2001 From: zhaoying9105 Date: Thu, 30 Jun 2022 18:33:31 +0800 Subject: [PATCH 009/250] [MLU] add exp and exp_grad kernel (#43852) --- paddle/fluid/operators/activation_op_mlu.cc | 56 +++++++++ .../tests/unittests/mlu/test_exp_op_mlu.py | 114 ++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_exp_op_mlu.py diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 4d6fe0d2b3830..e19ce87e7c8ec 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -208,6 +208,54 @@ class LogMLUKernel : public framework::OpKernel { } }; +template +class ExpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION; + + MLUCnnl::Exp(ctx, + prefer, + input_desc.get(), + GetBasePtr(input), + output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class ExpGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnlTensorDesc out_desc(*out); + + MLUCnnlOpTensorDesc op_tensor_desc( + CNNL_OP_TENSOR_MUL, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, + op_tensor_desc.get(), + dout_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(out), + dx_desc.get(), + GetBasePtr(dx), + ToCnnlDataType()); + } +}; + } // namespace operators } // namespace paddle @@ -303,3 +351,11 @@ REGISTER_OP_MLU_KERNEL( log10, ops::LogMLUKernel, ops::LogMLUKernel); + +REGISTER_OP_MLU_KERNEL(exp, + ops::ExpMLUKernel, + ops::ExpMLUKernel); + +REGISTER_OP_MLU_KERNEL(exp_grad, + ops::ExpGradMLUKernel, + ops::ExpGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_exp_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_exp_op_mlu.py new file mode 100644 index 0000000000000..70c001c69cf9d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_exp_op_mlu.py @@ -0,0 +1,114 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestExp(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "exp" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(20, 5).astype(self.dtype) + out = np.exp(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestExpFp16(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "exp" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.rand(20, 5).astype(self.dtype) + out = np.exp(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestExpNeg(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "exp" + self.place = paddle.MLUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.random([20, 5]).astype(self.dtype) + x -= 1 + out = np.exp(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +if __name__ == '__main__': + unittest.main() From d29a121418976aab11d095f6f31dbc80136ac4de Mon Sep 17 00:00:00 2001 From: Lux et Veritas <1004239791@qq.com> Date: Thu, 30 Jun 2022 18:40:16 +0800 Subject: [PATCH 010/250] [MLU] add mlu kernel for masked_select (#43816) --- .../fluid/operators/masked_select_op_mlu.cc | 204 ++++++++++++++++++ paddle/fluid/operators/mlu/mlu_baseop.cc | 13 ++ paddle/fluid/operators/mlu/mlu_baseop.h | 9 + .../mlu/test_masked_select_op_mlu.py | 169 +++++++++++++++ 4 files changed, 395 insertions(+) create mode 100644 paddle/fluid/operators/masked_select_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_masked_select_op_mlu.py diff --git a/paddle/fluid/operators/masked_select_op_mlu.cc b/paddle/fluid/operators/masked_select_op_mlu.cc new file mode 100644 index 0000000000000..279096b762ca8 --- /dev/null +++ b/paddle/fluid/operators/masked_select_op_mlu.cc @@ -0,0 +1,204 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class MaskedSelectedMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("X"); + auto mask = ctx.Input("Mask"); + auto out = ctx.Output("Y"); + + auto input_dim = input->dims(); + auto mask_dim = mask->dims(); + PADDLE_ENFORCE_EQ( + input_dim, + mask_dim, + platform::errors::InvalidArgument( + "The dim size of input and mask in OP(masked_selected) " + "must be equal, but got input dim:(%ld), mask dim: " + "(%ld). Please check input " + "value.", + input_dim, + mask_dim)); + + Tensor number(framework::TransToPhiDataType(VT::INT32)); + void* number_ptr = number.mutable_data({1}, ctx.GetPlace()); + + out->Resize(mask->dims()); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc mask_desc(*mask); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Mask(ctx, + CNNL_MASKED_SELECT, + input_desc.get(), + GetBasePtr(input), + mask_desc.get(), + GetBasePtr(mask), + nullptr, + nullptr, + out_desc.get(), + GetBasePtr(out), + static_cast(number_ptr)); + } +}; + +template +class MaskedSelectedGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto mask = ctx.Input("Mask"); + auto y_grad = ctx.Input(framework::GradVarName("Y")); + auto x_grad = ctx.Output(framework::GradVarName("X")); + + auto& dev_ctx = + ctx.template device_context(); + Tensor mask_int32, out_size; + std::vector out_size_vec; + mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); + out_size.mutable_data({1}, ctx.GetPlace()); + + MLUCnnlTensorDesc mask_desc(*mask); + MLUCnnlTensorDesc mask_int32_desc(mask_int32); + MLUCnnlTensorDesc out_size_desc(out_size); + auto cast_type = GetCastDataType(mask->dtype(), DataType::INT32); + MLUCnnl::Cast(ctx, + cast_type, + mask_desc.get(), + GetBasePtr(mask), + mask_int32_desc.get(), + GetBasePtr(&mask_int32)); + + auto mask_int32_dim = phi::vectorize(mask_int32.dims()); + std::vector reduce_dims; + for (size_t i = 0; i < mask_int32_dim.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + + std::string reduce_name = "reduce_sum"; + cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name); + MLUCnnlReduceDesc reduce_desc(reduce_dims, + reduce_op, + ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, + CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(ctx, + true, + reduce_desc.get(), + nullptr, + mask_int32_desc.get(), + GetBasePtr(&mask_int32), + 0, + nullptr, + nullptr, + out_size_desc.get(), + GetBasePtr(&out_size)); + + paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); + dev_ctx.Wait(); + + Tensor mask_int32_tmp; + mask_int32_tmp.ShareDataWith(mask_int32); + mask_int32_tmp.Resize({mask_int32.numel()}); + Tensor topk_v2_out(framework::TransToPhiDataType(VT::INT32)), + indices_int32(framework::TransToPhiDataType(VT::INT32)); + topk_v2_out.mutable_data({mask_int32.numel()}, ctx.GetPlace()); + 
indices_int32.mutable_data({mask_int32.numel()}, ctx.GetPlace()); + + MLUCnnlTensorDesc topk_v2_out_desc(topk_v2_out); + MLUCnnlTensorDesc indices_int32_desc(indices_int32); + MLUCnnlTensorDesc mask_int32_tmp_desc(mask_int32_tmp); + + const int dim = 0; + MLUCnnl::TopK(ctx, + mask_int32.numel(), + dim, + true, + false, + mask_int32_tmp_desc.get(), + GetBasePtr(&mask_int32_tmp), + topk_v2_out_desc.get(), + GetBasePtr(&topk_v2_out), + indices_int32_desc.get(), + GetBasePtr(&indices_int32)); + + auto stream = ctx.template device_context().stream(); + + Tensor indices_int32_out; + indices_int32_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); + memory::Copy(ctx.GetPlace(), + GetBasePtr(&indices_int32_out), + ctx.GetPlace(), + GetBasePtr(&indices_int32), + out_size_vec[0] * sizeof(int32_t), + stream); + + Tensor y_grad_tmp_out; + y_grad_tmp_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); + MLUCnnlTensorDesc y_grad_tmp_out_desc(y_grad_tmp_out); + memory::Copy(ctx.GetPlace(), + GetBasePtr(&y_grad_tmp_out), + ctx.GetPlace(), + GetBasePtr(y_grad), + out_size_vec[0] * sizeof(T), + stream); + + Tensor indices_int32_tmp; + indices_int32_tmp.ShareDataWith(indices_int32_out); + indices_int32_tmp.Resize({out_size_vec[0], 1}); + MLUCnnlTensorDesc indices_int32_tmp_desc(indices_int32_tmp); + + const cnnlScatterNdMode_t mode = CNNL_SCATTERND_UPDATE; + x_grad->Resize({x_grad->numel()}); + x_grad->mutable_data(ctx.GetPlace()); + MLUCnnlTensorDesc x_grad_desc(*x_grad); + MLUCnnl::ScatterNd(ctx, + mode, + indices_int32_tmp_desc.get(), + GetBasePtr(&indices_int32_tmp), + y_grad_tmp_out_desc.get(), + GetBasePtr(&y_grad_tmp_out), + nullptr, + nullptr, + x_grad_desc.get(), + GetBasePtr(x_grad)); + x_grad->Resize(mask->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(masked_select, + ops::MaskedSelectedMLUKernel, + ops::MaskedSelectedMLUKernel, + ops::MaskedSelectedMLUKernel); + +REGISTER_OP_MLU_KERNEL(masked_select_grad, + ops::MaskedSelectedGradMLUKernel, + ops::MaskedSelectedGradMLUKernel, + ops::MaskedSelectedGradMLUKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index c0619145ad5ab..972bdefdf02b8 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -2597,6 +2597,19 @@ MLURNNDesc::~MLURNNDesc() { cnnlSign(handle, input_desc, input, output_desc, output)); } +/* static */ void MLUCnnl::IndexSelect(const ExecutionContext& ctx, + const int dim, + cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t index_desc, + const void* index, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlIndexSelect( + handle, dim, input_desc, input, index_desc, index, output_desc, output)); +} + /* static */ void MLUCnnl::IsFinite(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 9031040ec5598..85f4439c3b974 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1391,6 +1391,15 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); + static void IndexSelect(const ExecutionContext& ctx, + const int dim, + cnnlTensorDescriptor_t input_desc, + const void* input, + const 
cnnlTensorDescriptor_t index_desc, + const void* index, + const cnnlTensorDescriptor_t output_desc, + void* output); + static void IsFinite(const ExecutionContext& ctx, const cnnlTensorDescriptor_t input_desc, const void* input, diff --git a/python/paddle/fluid/tests/unittests/mlu/test_masked_select_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_masked_select_op_mlu.py new file mode 100644 index 0000000000000..7efed0ea4b0f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_masked_select_op_mlu.py @@ -0,0 +1,169 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import sys + +sys.path.append("..") +import numpy as np +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +import paddle + +paddle.enable_static() + + +def np_masked_select(shape, x, mask): + result = np.empty(shape=(0), dtype=x.dtype) + sum = 0 + for index, (ele, ma) in enumerate(zip(np.nditer(x), np.nditer(mask))): + if ma: + sum = sum + 1 + result = np.append(result, ele) + for index, (ele, ma) in enumerate(zip(np.nditer(x), np.nditer(mask))): + if index >= sum: + result = np.append(result, 0) + result = np.reshape(result, shape) + return result + + +class TestMaskedSelectOp(OpTest): + + def setUp(self): + self.init() + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.op_type = "masked_select" + self.python_api = paddle.masked_select + x = np.random.random(self.shape).astype('float32') + mask = np.array(np.random.randint(2, size=self.shape, dtype=bool)) + out = np_masked_select(self.shape, x, mask) + self.inputs = {'X': x, 'Mask': mask} + self.outputs = {'Y': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Y') + + def init(self): + self.shape = (50, 3) + + +class TestMaskedSelectOp1(TestMaskedSelectOp): + + def init(self): + self.shape = (6, 8, 9, 18) + + +class TestMaskedSelectOp2(TestMaskedSelectOp): + + def init(self): + self.shape = (168, ) + + +@skip_check_grad_ci(reason="get_numeric_gradient not support int32") +class TestMaskedSelectOpInt32(TestMaskedSelectOp): + + def init_dtype(self): + self.dtype = np.int32 + + def test_check_grad(self): + pass + + +class TestMaskedSelectOpFp16(TestMaskedSelectOp): + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_grad(self): + x_grad = self.inputs['Mask'].astype(self.dtype) + x_grad = x_grad * (1 / x_grad.size) + self.check_grad_with_place(self.place, ['X'], + 'Y', + user_defined_grads=[x_grad]) + + +class TestMaskedSelectAPI(unittest.TestCase): + + def test_imperative_mode(self): + paddle.disable_static() + shape = (88, 6, 8) + np_x = np.random.random(shape).astype('float32') + np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) + x = paddle.to_tensor(np_x) + mask = paddle.to_tensor(np_mask) + out = paddle.masked_select(x, mask) + np_out = 
np_masked_select(shape, np_x, np_mask) + self.assertEqual(np.allclose(out.numpy(), np_out), True) + paddle.enable_static() + + def test_static_mode(self): + shape = [8, 9, 6] + x = paddle.fluid.data(shape=shape, dtype='float32', name='x') + mask = paddle.fluid.data(shape=shape, dtype='bool', name='mask') + np_x = np.random.random(shape).astype('float32') + np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) + + out = paddle.masked_select(x, mask) + np_out = np_masked_select(shape, np_x, np_mask) + + exe = paddle.static.Executor(place=paddle.device.MLUPlace(0)) + + res = exe.run(paddle.static.default_main_program(), + feed={ + "x": np_x, + "mask": np_mask + }, + fetch_list=[out]) + self.assertEqual(np.allclose(res, np_out), True) + + +class TestMaskedSelectError(unittest.TestCase): + + def test_error(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + + shape = [8, 9, 6] + x = paddle.fluid.data(shape=shape, dtype='float32', name='x') + mask = paddle.fluid.data(shape=shape, dtype='bool', name='mask') + mask_float = paddle.fluid.data(shape=shape, + dtype='float32', + name='mask_float') + np_x = np.random.random(shape).astype('float32') + np_mask = np.array(np.random.randint(2, size=shape, dtype=bool)) + + def test_x_type(): + paddle.masked_select(np_x, mask) + + self.assertRaises(TypeError, test_x_type) + + def test_mask_type(): + paddle.masked_select(x, np_mask) + + self.assertRaises(TypeError, test_mask_type) + + def test_mask_dtype(): + paddle.masked_select(x, mask_float) + + self.assertRaises(TypeError, test_mask_dtype) + + +if __name__ == '__main__': + unittest.main() From 23f18f46b21d5ebaa4ce6b529bfe80d959284b3e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 30 Jun 2022 19:17:49 +0800 Subject: [PATCH 011/250] [jit] save multi program into one param and seperate model (#43686) * save multi program into one param and seperate model * export class property --- .../dygraph_to_static/program_translator.py | 28 ++++- python/paddle/fluid/dygraph/jit.py | 108 ++++++++++++------ .../tests/unittests/test_jit_save_load.py | 59 ++++++++++ 3 files changed, 161 insertions(+), 34 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 49a218412c92d..43ce1fae16fc2 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -252,9 +252,11 @@ def __init__(self, function, input_spec=None, **kwargs): **kwargs(dict): other arguments like `build_strategy` et.al. """ # save the instance `self` while decorating a method of class. + if inspect.ismethod(function): self._dygraph_function = getattr(function, '__func__') self._class_instance = getattr(function, '__self__') + self._class_instance._original_funcs[ function.__name__] = self._dygraph_function else: @@ -272,6 +274,13 @@ def __init__(self, function, input_spec=None, **kwargs): self._cuda_graph_capture_mode = "" self._cuda_graph_pool_id = 0 + self._property = kwargs.get("property", False) + + @property + def is_property(self): + # whether is class proproty to be exported. 
+ return self._property + def train(self): if isinstance(self._class_instance, layers.Layer) and self._class_instance.training == False: @@ -325,7 +334,8 @@ def forward(self, x, y): return self._descriptor_cache[instance] def _clone(self): - return self.__class__(self._dygraph_function, self._input_spec) + return self.__class__(self._dygraph_function, self._input_spec, + **self._kwargs) def __call__(self, *args, **kwargs): """ @@ -338,6 +348,8 @@ def __call__(self, *args, **kwargs): Return: Outputs of decorated function. """ + if self._property: + return self._call_dygraph_function(*args, **kwargs) # 1. call dygraph function directly if not enable `declarative` if not self._program_trans.enable_to_static: @@ -417,6 +429,15 @@ def _call_dygraph_function(self, *args, **kwargs): return dygraph_function(*args, **kwargs) + def _raise_when_property(self): + """raise RuntimeError when property=True + + Raises: + RuntimeError: can not call this func when property=True + """ + if self.is_property: + raise RuntimeError("Can not call the func when property=True.") + def get_concrete_program(self, *args, **kwargs): """ Returns traced concrete program and inner executable partial layer. @@ -428,6 +449,7 @@ def get_concrete_program(self, *args, **kwargs): Returns: Traced ConcreteProgram and executable translated Layer. """ + self._raise_when_property() with_hook = kwargs.get("with_hook", False) is_train = kwargs.get("is_train", True) @@ -518,6 +540,7 @@ def concrete_program_specify_input_spec(self, input_spec (list[InputSpec], optional): Describes the input of the translate function. """ + self._raise_when_property() # if specific the `input_spec`, the length of program_cache will always 1, # else, return the last one. cached_program_len = len(self._program_cache) @@ -670,6 +693,7 @@ def inputs(self): """ Returns input tensors of recent converted static program. """ + self._raise_when_property() concrete_program = self.concrete_program inputs = [ var for var in flatten(concrete_program.inputs) @@ -682,6 +706,7 @@ def outputs(self): """ Returns output tensors of recent converted static program. """ + self._raise_when_property() concrete_program = self.concrete_program outputs = [ var for var in flatten(concrete_program.outputs) @@ -695,6 +720,7 @@ def main_program(self): """ Returns recent converted static main program. """ + self._raise_when_property() concrete_program = self.concrete_program main_program = concrete_program.main_program return main_program diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index b6847efab1d68..393f1c1570453 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -160,7 +160,10 @@ def copy_decorator_attrs(original_func, decorated_obj): return decorated_obj -def declarative(function=None, input_spec=None, build_strategy=None): +def declarative(function=None, + input_spec=None, + build_strategy=None, + property=False): """ Converts imperative dygraph APIs into declarative function APIs. Decorator @declarative handles the Program and Executor of static mode and returns @@ -178,6 +181,7 @@ def declarative(function=None, input_spec=None, build_strategy=None): in the computational graph and memory optimization during the execution of the computational graph. For more information about build_strategy, please refer to :code:`paddle.static.BuildStrategy`. The default is None. + property(bool, Optional): whether the fucntion is python property. The default is False. 
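        A minimal, illustrative sketch of the intended usage, assuming the decorator is
        reachable as `paddle.jit.to_static`; the layer and method name `fscale` below are
        hypothetical and not part of this patch. A property-style method takes no inputs
        and simply returns a Python value or Tensor:

            import paddle

            class SimpleNet(paddle.nn.Layer):
                def __init__(self):
                    super(SimpleNet, self).__init__()
                    self.scale = 2.0

                # not traced into a Program; called eagerly and its return value
                # is collected as an exported property
                @paddle.jit.to_static(property=True)
                def fscale(self):
                    return self.scale

        Calling such a method runs the original dygraph function directly, and
        `paddle.jit.save` gathers the returned value for export (its on-disk
        serialization is still left as a TODO in this patch).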
Returns: @@ -215,7 +219,8 @@ def decorated(python_func): decorated_obj=StaticFunction( function=python_func, input_spec=input_spec, - build_strategy=build_strategy)) + build_strategy=build_strategy, + property=property)) return static_layer @@ -304,6 +309,9 @@ def __init__(self): self._program_only = False self.with_hook = False + # if True, multi `StaticFunction` will share params in one file. + self.combine_params = False + @property def output_spec(self): return self._output_spec @@ -371,7 +379,7 @@ def keep_name_table(self, value): def _parse_save_configs(configs): - supported_configs = ['output_spec', "with_hook"] + supported_configs = ['output_spec', "with_hook", "use_combine"] # input check for key in configs: @@ -384,6 +392,7 @@ def _parse_save_configs(configs): inner_config = _SaveLoadConfig() inner_config.output_spec = configs.get('output_spec', None) inner_config.with_hook = configs.get('with_hook', False) + inner_config.combine_params = configs.get("use_combine", False) return inner_config @@ -840,6 +849,9 @@ def fun(inputs): # whether outermost layer has pre/post hook, if does, we need also save # these operators in program. with_hook = configs.with_hook + combine_params = configs.combine_params + if combine_params: + configs._program_only = True scope = core.Scope() extra_var_info = dict() @@ -852,10 +864,21 @@ def fun(inputs): functions = [ layer, ] + + all_vars = set() + property_vals = [] # (value, key) for attr_func in functions: if isinstance(layer, Layer): static_func = getattr(inner_layer, attr_func, None) if isinstance(static_func, StaticFunction): + if static_func.is_property: + # property method to be exported + immediate_val = static_func() + property_vals.append( + (immediate_val, + layer.__class__.__name__ + '.' + attr_func)) + continue + concrete_program = static_func.concrete_program_specify_input_spec( inner_input_spec, with_hook=with_hook) elif 'forward' == attr_func: @@ -875,10 +898,15 @@ def fun(inputs): inner_input_spec = None else: continue - else: # When layer is a function if isinstance(attr_func, StaticFunction): + if attr_func.is_property: + # property method to be exported + immediate_val = attr_func() + property_vals.append((immediate_val, attr_func)) + continue + concrete_program = attr_func.concrete_program_specify_input_spec( inner_input_spec) else: @@ -894,6 +922,7 @@ def fun(inputs): '`jit.save` will only save the `Program`, not the parameters. If you have to save the parameters, please make sure that {} is a member function of `paddle.nn.Layer` and the saved parameters are in `state_dict`' .format(layer)) + # when save multi `StaticFunction`, all `StaticFunction` share params. dygraph_state_dict = None if isinstance(inner_layer, Layer): dygraph_state_dict = inner_layer.to_static_state_dict() @@ -913,35 +942,32 @@ def fun(inputs): state_names_dict[var.name] = structured_name state_var_dict[var.name] = var - # 3. 
share parameters from Layer to scope & record var info - with dygraph.guard(): - for param_or_buffer in concrete_program.parameters: - # share to scope - if param_or_buffer.type == core.VarDesc.VarType.VOCAB: - scr_tensor = param_or_buffer.value().get_map_tensor() - tgt_var = scope.var(param_or_buffer.name) - tgt_var.set_vocab(scr_tensor) - else: - param_or_buffer_tensor = scope.var( - param_or_buffer.name).get_tensor() - #src_tensor = param_or_buffer.value().get_tensor() - src_tensor = state_var_dict[ - param_or_buffer.name].value().get_tensor() - param_or_buffer_tensor._share_data_with(src_tensor) - # record var info - if param_or_buffer.name not in extra_var_info: - extra_info_dict = dict() - if param_or_buffer.name in state_names_dict: - extra_info_dict[ - 'structured_name'] = state_names_dict[ - param_or_buffer.name] - extra_info_dict[ - 'stop_gradient'] = param_or_buffer.stop_gradient - if isinstance(param_or_buffer, - (ParamBase, EagerParamBase)): - extra_info_dict[ - 'trainable'] = param_or_buffer.trainable - extra_var_info[param_or_buffer.name] = extra_info_dict + # 3. share parameters from Layer to scope & record var info + with dygraph.guard(): + for param_or_buffer in concrete_program.parameters: + # share to scope + if param_or_buffer.type == core.VarDesc.VarType.VOCAB: + scr_tensor = param_or_buffer.value().get_map_tensor() + tgt_var = scope.var(param_or_buffer.name) + tgt_var.set_vocab(scr_tensor) + else: + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + #src_tensor = param_or_buffer.value().get_tensor() + src_tensor = state_var_dict[ + param_or_buffer.name].value().get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + # record var info + if param_or_buffer.name not in extra_var_info: + extra_info_dict = dict() + if param_or_buffer.name in state_names_dict: + extra_info_dict['structured_name'] = state_names_dict[ + param_or_buffer.name] + extra_info_dict[ + 'stop_gradient'] = param_or_buffer.stop_gradient + if isinstance(param_or_buffer, (ParamBase, EagerParamBase)): + extra_info_dict['trainable'] = param_or_buffer.trainable + extra_var_info[param_or_buffer.name] = extra_info_dict # 4. 
build input & output of save_infernece_model # NOTE(chenweihang): [ Get input variables name ] @@ -991,6 +1017,22 @@ def fun(inputs): program_only=configs._program_only, clip_extra=False) + # collect all vars + for var in concrete_program.main_program.list_vars(): + all_vars.add(var) + + # save shared params + if combine_params: + params_filename = file_prefix + INFER_PARAMS_SUFFIX + with scope_guard(scope): + paddle.static.save_vars(Executor(_current_expected_place()), + dirname=model_path, + vars=list( + filter(paddle.fluid.io.is_persistable, + all_vars)), + filename=params_filename) + # TODO: save property + # NOTE(chenweihang): [ Save extra variable info ] # save_inference_model will lose some important variable information, including: # - Variable name and correspondence (when saved variables as one file) diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index bf5ccf1a854ff..f467fbe4888e6 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -1153,6 +1153,65 @@ def forward(self, x): return self._linear_2(y) +class Net(paddle.nn.Layer): + + def __init__(self): + super(Net, self).__init__() + self.fc1 = paddle.nn.Linear(4, 4) + self.fc2 = paddle.nn.Linear(4, 4) + self.bias = 0.4 + self.flag = paddle.ones([2], dtype="int32") + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def log_softmax(self, input): + return paddle.nn.functional.log_softmax(input, axis=-1) + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def forward(self, x): + out = self.fc1(x) + out = paddle.nn.functional.relu(out) + out = paddle.mean(out) + return out + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def infer(self, input): + out = self.fc2(input) + out = out + self.bias + out = paddle.mean(out) + return out + + # For extra Python float + @paddle.jit.to_static(property=True) + def fbias(self): + return self.bias + 1 + + # For extra Tensor + @paddle.jit.to_static(property=True) + def fflag(self): + return self.flag + + +class TestJitSaveCombine(unittest.TestCase): + + def setUp(self): + # enable dygraph mode + paddle.disable_static() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + + def test_save_load_finetune_load(self): + model_path = os.path.join(self.temp_dir.name, + "test_jit_save_combine/model") + + # Use new namespace + with unique_name.guard(): + net = Net() + #save + paddle.jit.save(net, model_path, use_combine=True) + + class LayerLoadFinetune(paddle.nn.Layer): def __init__(self, in_size, out_size, load_path): From 842f363ddc38ee56b50c6763107606197ea1fa0b Mon Sep 17 00:00:00 2001 From: Vigi Zhang Date: Thu, 30 Jun 2022 19:25:01 +0800 Subject: [PATCH 012/250] change email address in policy (#43976) --- SECURITY.md | 2 +- SECURITY_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 79bf3353ad4f9..04ccdd8062f51 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -24,7 +24,7 @@ PaddlePaddle security team attaches great importance to the security of the fram ### Reporting vulnerabilities -We encourage responsible disclosure of security issues to PaddlePaddle and please email reports about any security issues you find to security@paddlepaddle.org. 
+We encourage responsible disclosure of security issues to PaddlePaddle and please email reports about any security issues you find to paddle-security@baidu.com. diff --git a/SECURITY_cn.md b/SECURITY_cn.md index 00b222912d277..68ad6b32176b8 100644 --- a/SECURITY_cn.md +++ b/SECURITY_cn.md @@ -20,7 +20,7 @@ ### 报告安全问题 -我们鼓励向飞桨负责任地披露安全问题,请将所发现的安全问题发送电子邮件到 security@paddlepaddle.org。 +我们鼓励向飞桨负责任地披露安全问题,请将所发现的安全问题发送电子邮件到 paddle-security@baidu.com。 在安全团队收到邮件后将会及时与您沟通并反馈问题修复进度。 From 73f957cf56e9ee7fea5bb338adc91bc224daf1ce Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 30 Jun 2022 19:35:26 +0800 Subject: [PATCH 013/250] fused_gate_attention manual code in eager (#43897) * fused_gate_attention manual code in eager * refine * refine * refine * refine * refine * refine --- paddle/fluid/eager/amp_auto_cast.h | 1 + paddle/fluid/eager/api/CMakeLists.txt | 1 + paddle/fluid/eager/api/manual/CMakeLists.txt | 9 + .../api/manual/fluid_manual/CMakeLists.txt | 8 + .../manual/fluid_manual/dygraph_forward_api.h | 44 ++ .../fluid_manual/forwards/CMakeLists.txt | 10 + .../forwards/fused_gate_attention_fwd_func.cc | 389 ++++++++++++++++++ .../manual/fluid_manual/nodes/CMakeLists.txt | 8 + .../nodes/fused_gate_attention_node.cc | 233 +++++++++++ .../api/manual/fluid_manual/nodes/nodes.h | 176 ++++++++ .../auto_code_generator/eager_generator.cc | 13 +- .../generate_file_structures.py | 10 +- .../fused/fused_gate_attention_op.cu | 15 +- 13 files changed, 902 insertions(+), 15 deletions(-) create mode 100644 paddle/fluid/eager/api/manual/CMakeLists.txt create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/CMakeLists.txt create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h diff --git a/paddle/fluid/eager/amp_auto_cast.h b/paddle/fluid/eager/amp_auto_cast.h index ed05a6e69c026..5110f6f883e67 100644 --- a/paddle/fluid/eager/amp_auto_cast.h +++ b/paddle/fluid/eager/amp_auto_cast.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" #include "paddle/fluid/framework/convert_utils.h" namespace egr { diff --git a/paddle/fluid/eager/api/CMakeLists.txt b/paddle/fluid/eager/api/CMakeLists.txt index 4525a58a44d48..0da46bbbfbbd6 100644 --- a/paddle/fluid/eager/api/CMakeLists.txt +++ b/paddle/fluid/eager/api/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(manual) add_subdirectory(utils) add_subdirectory(generated) diff --git a/paddle/fluid/eager/api/manual/CMakeLists.txt b/paddle/fluid/eager/api/manual/CMakeLists.txt new file mode 100644 index 0000000000000..ebfcaad2eeac7 --- /dev/null +++ b/paddle/fluid/eager/api/manual/CMakeLists.txt @@ -0,0 +1,9 @@ +if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) + add_subdirectory(fluid_manual) + set(fluid_manual_functions + ${fluid_manual_functions} + PARENT_SCOPE) + set(fluid_manual_nodes + ${fluid_manual_nodes} + PARENT_SCOPE) +endif() diff --git a/paddle/fluid/eager/api/manual/fluid_manual/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/CMakeLists.txt new file 
mode 100644 index 0000000000000..254f4a7246da7 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/CMakeLists.txt @@ -0,0 +1,8 @@ +add_subdirectory(forwards) +add_subdirectory(nodes) +set(fluid_manual_functions + ${fluid_manual_functions} + PARENT_SCOPE) +set(fluid_manual_nodes + ${fluid_manual_nodes} + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h new file mode 100644 index 0000000000000..3715544b923aa --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "glog/logging.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" + +std::tuple +fused_gate_attention_dygraph_function( + const paddle::experimental::Tensor& Query, + const paddle::experimental::Tensor& Key, + const paddle::experimental::Tensor& QueryWeight, + const paddle::experimental::Tensor& KeyWeight, + const paddle::experimental::Tensor& ValueWeight, + const paddle::experimental::Tensor& QKVWeight, + const paddle::experimental::Tensor& NonbatchedBias, + const paddle::experimental::Tensor& SrcMask, + const paddle::experimental::Tensor& GateWeight, + const paddle::experimental::Tensor& GateBias, + const paddle::experimental::Tensor& OutLinearWeight, + const paddle::experimental::Tensor& OutLinearBias, + const paddle::framework::AttributeMap& attr_map); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt new file mode 100644 index 0000000000000..2a7d72eb7cabd --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt @@ -0,0 +1,10 @@ +cc_library( + fused_gate_attention_fwd_func + SRCS fused_gate_attention_fwd_func.cc + DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + +add_dependencies(fused_gate_attention_fwd_func eager_codegen) + +set(fluid_manual_functions + fused_gate_attention_fwd_func + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc new file mode 100644 index 0000000000000..81b4db4df207e --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc @@ -0,0 +1,389 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/amp_auto_cast.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" + +std::tuple +fused_gate_attention_dygraph_function( + const paddle::experimental::Tensor& Query, + const paddle::experimental::Tensor& Key, + const paddle::experimental::Tensor& QueryWeight, + const paddle::experimental::Tensor& KeyWeight, + const paddle::experimental::Tensor& ValueWeight, + const paddle::experimental::Tensor& QKVWeight, + const paddle::experimental::Tensor& NonbatchedBias, + const paddle::experimental::Tensor& SrcMask, + const paddle::experimental::Tensor& GateWeight, + const paddle::experimental::Tensor& GateBias, + const paddle::experimental::Tensor& OutLinearWeight, + const paddle::experimental::Tensor& OutLinearBias, + const paddle::framework::AttributeMap& attr_map) { + paddle::platform::RecordEvent dygraph_entrance_record_event( + "fused_gate_attention dygraph", + paddle::platform::TracerEventType::Operator, + 1); + VLOG(3) << "Running Eager Forward Op: fused_gate_attention"; + // Dygraph Forward Pass + + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = { + {Query}, {SrcMask}, {OutLinearWeight}, {OutLinearBias}}; + if (Key.initialized()) amp_tensors_vector.push_back({Key}); + if (QueryWeight.initialized()) amp_tensors_vector.push_back({QueryWeight}); + if (KeyWeight.initialized()) amp_tensors_vector.push_back({KeyWeight}); + if (ValueWeight.initialized()) amp_tensors_vector.push_back({ValueWeight}); + if (QKVWeight.initialized()) amp_tensors_vector.push_back({QKVWeight}); + if (NonbatchedBias.initialized()) + amp_tensors_vector.push_back({NonbatchedBias}); + if (GateWeight.initialized()) amp_tensors_vector.push_back({GateWeight}); + if (GateBias.initialized()) amp_tensors_vector.push_back({GateBias}); + + auto amp_dst_dtype = + egr::GetAmpDestDtype("fused_gate_attention", amp_tensors_vector); + + auto NEW_Query = + egr::AmpAutoCast("Query", Query, amp_dst_dtype, "fused_gate_attention"); + auto NEW_SrcMask = egr::AmpAutoCast( + "SrcMask", SrcMask, amp_dst_dtype, "fused_gate_attention"); + auto NEW_OutLinearWeight = egr::AmpAutoCast("OutLinearWeight", + OutLinearWeight, + amp_dst_dtype, + "fused_gate_attention"); + auto NEW_OutLinearBias = egr::AmpAutoCast( + "OutLinearBias", OutLinearBias, amp_dst_dtype, "fused_gate_attention"); + auto NEW_Key = ((Key.initialized()) + ? egr::AmpAutoCast( + "Key", Key, amp_dst_dtype, "fused_gate_attention") + : Key); + auto NEW_QueryWeight = + ((QueryWeight.initialized()) ? egr::AmpAutoCast("QueryWeight", + QueryWeight, + amp_dst_dtype, + "fused_gate_attention") + : QueryWeight); + auto NEW_KeyWeight = + ((KeyWeight.initialized()) ? 
egr::AmpAutoCast("KeyWeight", + KeyWeight, + amp_dst_dtype, + "fused_gate_attention") + : KeyWeight); + auto NEW_ValueWeight = + ((ValueWeight.initialized()) ? egr::AmpAutoCast("ValueWeight", + ValueWeight, + amp_dst_dtype, + "fused_gate_attention") + : ValueWeight); + auto NEW_QKVWeight = + ((QKVWeight.initialized()) ? egr::AmpAutoCast("QKVWeight", + QKVWeight, + amp_dst_dtype, + "fused_gate_attention") + : QKVWeight); + auto NEW_NonbatchedBias = ((NonbatchedBias.initialized()) + ? egr::AmpAutoCast("NonbatchedBias", + NonbatchedBias, + amp_dst_dtype, + "fused_gate_attention") + : NonbatchedBias); + auto NEW_GateWeight = + ((GateWeight.initialized()) ? egr::AmpAutoCast("GateWeight", + GateWeight, + amp_dst_dtype, + "fused_gate_attention") + : GateWeight); + auto NEW_GateBias = + ((GateBias.initialized()) + ? egr::AmpAutoCast( + "GateBias", GateBias, amp_dst_dtype, "fused_gate_attention") + : GateBias); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return fused_gate_attention_dygraph_function(NEW_Query, + NEW_Key, + NEW_QueryWeight, + NEW_KeyWeight, + NEW_ValueWeight, + NEW_QKVWeight, + NEW_NonbatchedBias, + NEW_SrcMask, + NEW_GateWeight, + NEW_GateBias, + NEW_OutLinearWeight, + NEW_OutLinearBias, + attr_map); + } + } + + std::map>> ins = + {{"Query", egr::EagerUtils::TrySyncToVars(Query)}, + {"SrcMask", egr::EagerUtils::TrySyncToVars(SrcMask)}, + {"OutLinearWeight", egr::EagerUtils::TrySyncToVars(OutLinearWeight)}, + {"OutLinearBias", egr::EagerUtils::TrySyncToVars(OutLinearBias)}}; + if (Key.initialized()) ins["Key"] = egr::EagerUtils::TrySyncToVars(Key); + if (QueryWeight.initialized()) + ins["QueryWeight"] = egr::EagerUtils::TrySyncToVars(QueryWeight); + if (KeyWeight.initialized()) + ins["KeyWeight"] = egr::EagerUtils::TrySyncToVars(KeyWeight); + if (ValueWeight.initialized()) + ins["ValueWeight"] = egr::EagerUtils::TrySyncToVars(ValueWeight); + if (QKVWeight.initialized()) + ins["QKVWeight"] = egr::EagerUtils::TrySyncToVars(QKVWeight); + if (NonbatchedBias.initialized()) + ins["NonbatchedBias"] = egr::EagerUtils::TrySyncToVars(NonbatchedBias); + if (GateWeight.initialized()) + ins["GateWeight"] = egr::EagerUtils::TrySyncToVars(GateWeight); + if (GateBias.initialized()) + ins["GateBias"] = egr::EagerUtils::TrySyncToVars(GateBias); + + std::map>> outs = + {{"QueryTransposeOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"KeyTransposeOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"ValueTransposeOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKVTransposeOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"SoftmaxOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"FMHAOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"GateOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}}; + + // Prepare Autograd Meta + egr::AutogradMeta* p_autograd_Query = + egr::EagerUtils::nullable_autograd_meta(Query); + egr::AutogradMeta* p_autograd_Key = + egr::EagerUtils::nullable_autograd_meta(Key); + egr::AutogradMeta* p_autograd_QueryWeight = + egr::EagerUtils::nullable_autograd_meta(QueryWeight); + egr::AutogradMeta* p_autograd_KeyWeight = + egr::EagerUtils::nullable_autograd_meta(KeyWeight); + 
egr::AutogradMeta* p_autograd_ValueWeight = + egr::EagerUtils::nullable_autograd_meta(ValueWeight); + egr::AutogradMeta* p_autograd_QKVWeight = + egr::EagerUtils::nullable_autograd_meta(QKVWeight); + egr::AutogradMeta* p_autograd_NonbatchedBias = + egr::EagerUtils::nullable_autograd_meta(NonbatchedBias); + egr::AutogradMeta* p_autograd_SrcMask = + egr::EagerUtils::nullable_autograd_meta(SrcMask); + egr::AutogradMeta* p_autograd_GateWeight = + egr::EagerUtils::nullable_autograd_meta(GateWeight); + egr::AutogradMeta* p_autograd_GateBias = + egr::EagerUtils::nullable_autograd_meta(GateBias); + egr::AutogradMeta* p_autograd_OutLinearWeight = + egr::EagerUtils::nullable_autograd_meta(OutLinearWeight); + egr::AutogradMeta* p_autograd_OutLinearBias = + egr::EagerUtils::nullable_autograd_meta(OutLinearBias); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + + bool require_any_grad = + egr::EagerUtils::ComputeRequireGrad(trace_backward, + p_autograd_Query, + p_autograd_Key, + p_autograd_QueryWeight, + p_autograd_KeyWeight, + p_autograd_ValueWeight, + p_autograd_QKVWeight, + p_autograd_NonbatchedBias, + p_autograd_SrcMask, + p_autograd_GateWeight, + p_autograd_GateBias, + p_autograd_OutLinearWeight, + p_autograd_OutLinearBias); + + paddle::framework::AttributeMap attrs = attr_map; + paddle::framework::AttributeMap default_attrs; + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_gate_attention", + ins, + outs, + attrs, + egr::Controller::Instance().GetExpectedPlace(), + &default_attrs, + true, + {}); + + paddle::experimental::Tensor QueryTransposeOut; + egr::EagerUtils::GetOutput(outs["QueryTransposeOut"][0], &QueryTransposeOut); + paddle::experimental::Tensor KeyTransposeOut; + egr::EagerUtils::GetOutput(outs["KeyTransposeOut"][0], &KeyTransposeOut); + paddle::experimental::Tensor ValueTransposeOut; + egr::EagerUtils::GetOutput(outs["ValueTransposeOut"][0], &ValueTransposeOut); + paddle::experimental::Tensor QKVTransposeOut; + egr::EagerUtils::GetOutput(outs["QKVTransposeOut"][0], &QKVTransposeOut); + paddle::experimental::Tensor SoftmaxOut; + egr::EagerUtils::GetOutput(outs["SoftmaxOut"][0], &SoftmaxOut); + paddle::experimental::Tensor FMHAOut; + egr::EagerUtils::GetOutput(outs["FMHAOut"][0], &FMHAOut); + paddle::experimental::Tensor GateOut; + egr::EagerUtils::GetOutput(outs["GateOut"][0], &GateOut); + paddle::experimental::Tensor Out; + egr::EagerUtils::GetOutput(outs["Out"][0], &Out); + + { + paddle::platform::RecordEvent node_creation_record_event( + "fused_gate_attention node_creation", + paddle::platform::TracerEventType::Operator, + 1); + egr::AutogradMeta* p_autograd_QueryTransposeOut = + egr::EagerUtils::autograd_meta(&QueryTransposeOut); + egr::AutogradMeta* p_autograd_KeyTransposeOut = + egr::EagerUtils::autograd_meta(&KeyTransposeOut); + egr::AutogradMeta* p_autograd_ValueTransposeOut = + egr::EagerUtils::autograd_meta(&ValueTransposeOut); + egr::AutogradMeta* p_autograd_QKVTransposeOut = + egr::EagerUtils::autograd_meta(&QKVTransposeOut); + egr::AutogradMeta* p_autograd_SoftmaxOut = + egr::EagerUtils::autograd_meta(&SoftmaxOut); + egr::AutogradMeta* p_autograd_FMHAOut = + egr::EagerUtils::autograd_meta(&FMHAOut); + egr::AutogradMeta* p_autograd_GateOut = + egr::EagerUtils::autograd_meta(&GateOut); + egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out); + if (require_any_grad) { + VLOG(6) << " Construct Grad for fused_gate_attention "; + egr::EagerUtils::PassStopGradient(false, + p_autograd_QueryTransposeOut, + 
p_autograd_KeyTransposeOut, + p_autograd_ValueTransposeOut, + p_autograd_QKVTransposeOut, + p_autograd_SoftmaxOut, + p_autograd_FMHAOut, + p_autograd_GateOut, + p_autograd_Out); + // Create GradOpNode + auto grad_node = std::shared_ptr( + new fused_gate_attentionGradNodeCompat(8, 12)); + + bool merge_qkv = true; + if (attrs.count("merge_qkv")) { + merge_qkv = BOOST_GET_CONST(bool, attrs.at("merge_qkv")); + } + + bool has_gating = true; + if (attrs.count("has_gating")) { + has_gating = BOOST_GET_CONST(bool, attrs.at("has_gating")); + } + + // Set Attributes + grad_node->SetAttrMap(std::move(attrs)); + grad_node->SetDefaultAttrMap(std::move(default_attrs)); + + grad_node->SetTensorWrapperFMHAOut(FMHAOut); + grad_node->SetTensorWrapperQuery(Query); + grad_node->SetTensorWrapperSoftmaxOut(SoftmaxOut); + grad_node->SetTensorWrapperOutLinearBias(OutLinearBias); + grad_node->SetTensorWrapperOutLinearWeight(OutLinearWeight); + + grad_node->SetGradOutMeta(Query, 0); + grad_node->SetGradOutMeta(OutLinearWeight, 10); + grad_node->SetGradOutMeta(OutLinearBias, 11); + + if (merge_qkv) { + grad_node->SetTensorWrapperQKVTransposeOut(QKVTransposeOut); + grad_node->SetTensorWrapperQKVWeight(QKVWeight); + grad_node->SetGradOutMeta(QKVWeight, 5); + } else { + grad_node->SetTensorWrapperKey(Key); + grad_node->SetTensorWrapperQueryWeight(QueryWeight); + grad_node->SetTensorWrapperKeyWeight(KeyWeight); + grad_node->SetTensorWrapperValueWeight(ValueWeight); + grad_node->SetTensorWrapperQueryTransposeOut(QueryTransposeOut); + grad_node->SetTensorWrapperKeyTransposeOut(KeyTransposeOut); + grad_node->SetTensorWrapperValueTransposeOut(ValueTransposeOut); + + grad_node->SetGradOutMeta(Key, 1); + grad_node->SetGradOutMeta(QueryWeight, 2); + grad_node->SetGradOutMeta(KeyWeight, 3); + grad_node->SetGradOutMeta(ValueWeight, 4); + } + + if (has_gating) { + grad_node->SetTensorWrapperGateWeight(GateWeight); + grad_node->SetGradOutMeta(GateWeight, 8); + grad_node->SetTensorWrapperGateBias(GateBias); + grad_node->SetGradOutMeta(GateBias, 9); + grad_node->SetTensorWrapperGateOut(GateOut); + } + + if (NonbatchedBias.initialized()) { + grad_node->SetTensorWrapperNonbatchedBias(NonbatchedBias); + grad_node->SetGradOutMeta(NonbatchedBias, 6); + } + + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QueryTransposeOut, 0); + grad_node->SetGradInMeta(QueryTransposeOut, 0); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_KeyTransposeOut, 1); + grad_node->SetGradInMeta(KeyTransposeOut, 1); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_ValueTransposeOut, 2); + grad_node->SetGradInMeta(ValueTransposeOut, 2); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKVTransposeOut, 3); + grad_node->SetGradInMeta(QKVTransposeOut, 3); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_SoftmaxOut, 4); + grad_node->SetGradInMeta(SoftmaxOut, 4); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_FMHAOut, 5); + grad_node->SetGradInMeta(FMHAOut, 5); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_GateOut, 6); + grad_node->SetGradInMeta(GateOut, 6); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 7); + egr::EagerUtils::SetHistory(p_autograd_Out, grad_node); + grad_node->SetGradInMeta(Out, 7); + egr::EagerUtils::CheckAndRetainGrad(Out); + } + } + + return std::make_tuple(QueryTransposeOut, + KeyTransposeOut, + ValueTransposeOut, + QKVTransposeOut, + SoftmaxOut, + FMHAOut, + GateOut, + Out); +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt new file mode 
100644 index 0000000000000..fb5e129223544 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt @@ -0,0 +1,8 @@ +cc_library( + fused_gate_attention_node + SRCS fused_gate_attention_node.cc + DEPS ${eager_deps} ${fluid_deps}) + +set(fluid_manual_nodes + fused_gate_attention_node + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc new file mode 100644 index 0000000000000..a1ccaf09de8b4 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" + +paddle::small_vector, + egr::kSlotSmallVectorSize> +fused_gate_attentionGradNodeCompat::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + VLOG(3) << "Running Eager Backward Node: fused_gate_attentionGradNodeCompat"; + + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + outputs(12); + paddle::small_vector, + egr::kSlotSmallVectorSize> + hooked_grads0 = + fused_gate_attentionGradNodeCompat::ApplyGradientHooks(grads); + + bool merge_qkv = true; + if (attr_map_.count("merge_qkv")) { + merge_qkv = BOOST_GET_CONST(bool, attr_map_.at("merge_qkv")); + } + + bool has_gating = true; + if (attr_map_.count("has_gating")) { + has_gating = BOOST_GET_CONST(bool, attr_map_.at("has_gating")); + } + + std::map>> ins0 = + {{"FMHAOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->FMHAOut_))}, + {"Out@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[7])}, + {"OutLinearBias", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearBias_))}, + {"OutLinearWeight", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearWeight_))}, + {"Query", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Query_))}, + {"SoftmaxOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxOut_))}}; + std::map>> outs0; + + if ((!out_metas[11].empty()) && (!(out_metas[11][0].IsStopGradient()))) { + outs0.insert({"OutLinearBias@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[10].empty()) && (!(out_metas[10][0].IsStopGradient()))) { + outs0.insert({"OutLinearWeight@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[0].empty()) && 
(!(out_metas[0][0].IsStopGradient()))) { + outs0.insert({"Query@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + + if (merge_qkv) { + auto QKVTransposeOut = + egr::EagerUtils::RecoverTensorWrapper(&this->QKVTransposeOut_); + if (QKVTransposeOut.defined()) + ins0["QKVTransposeOut"] = egr::EagerUtils::TrySyncToVars(QKVTransposeOut); + auto QKVWeight = egr::EagerUtils::RecoverTensorWrapper(&this->QKVWeight_); + if (QKVWeight.defined()) + ins0["QKVWeight"] = egr::EagerUtils::TrySyncToVars(QKVWeight); + if (QKVWeight.defined() && (!out_metas[5].empty()) && + (!out_metas[5][0].IsStopGradient())) + outs0["QKVWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } else { + auto Key = egr::EagerUtils::RecoverTensorWrapper(&this->Key_); + if (Key.defined()) ins0["Key"] = egr::EagerUtils::TrySyncToVars(Key); + auto QueryWeight = + egr::EagerUtils::RecoverTensorWrapper(&this->QueryWeight_); + if (QueryWeight.defined()) + ins0["QueryWeight"] = egr::EagerUtils::TrySyncToVars(QueryWeight); + auto KeyWeight = egr::EagerUtils::RecoverTensorWrapper(&this->KeyWeight_); + if (KeyWeight.defined()) + ins0["KeyWeight"] = egr::EagerUtils::TrySyncToVars(KeyWeight); + auto ValueWeight = + egr::EagerUtils::RecoverTensorWrapper(&this->ValueWeight_); + if (ValueWeight.defined()) + ins0["ValueWeight"] = egr::EagerUtils::TrySyncToVars(ValueWeight); + auto QueryTransposeOut = + egr::EagerUtils::RecoverTensorWrapper(&this->QueryTransposeOut_); + if (QueryTransposeOut.defined()) + ins0["QueryTransposeOut"] = + egr::EagerUtils::TrySyncToVars(QueryTransposeOut); + auto KeyTransposeOut = + egr::EagerUtils::RecoverTensorWrapper(&this->KeyTransposeOut_); + if (KeyTransposeOut.defined()) + ins0["KeyTransposeOut"] = egr::EagerUtils::TrySyncToVars(KeyTransposeOut); + auto ValueTransposeOut = + egr::EagerUtils::RecoverTensorWrapper(&this->ValueTransposeOut_); + if (ValueTransposeOut.defined()) + ins0["ValueTransposeOut"] = + egr::EagerUtils::TrySyncToVars(ValueTransposeOut); + + if (Key.defined() && (!out_metas[1].empty()) && + (!out_metas[1][0].IsStopGradient())) + outs0["Key@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (QueryWeight.defined() && (!out_metas[2].empty()) && + (!out_metas[2][0].IsStopGradient())) + outs0["QueryWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (KeyWeight.defined() && (!out_metas[3].empty()) && + (!out_metas[3][0].IsStopGradient())) + outs0["KeyWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (ValueWeight.defined() && (!out_metas[4].empty()) && + (!out_metas[4][0].IsStopGradient())) + outs0["ValueWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + if (has_gating) { + auto GateBias = egr::EagerUtils::RecoverTensorWrapper(&this->GateBias_); + if (GateBias.defined()) + ins0["GateBias"] = egr::EagerUtils::TrySyncToVars(GateBias); + auto GateWeight = egr::EagerUtils::RecoverTensorWrapper(&this->GateWeight_); + if (GateWeight.defined()) + ins0["GateWeight"] = egr::EagerUtils::TrySyncToVars(GateWeight); + auto GateOut = egr::EagerUtils::RecoverTensorWrapper(&this->GateOut_); + if (GateOut.defined()) + ins0["GateOut"] = egr::EagerUtils::TrySyncToVars(GateOut); + if (GateBias.defined() && (!out_metas[9].empty()) && + (!out_metas[9][0].IsStopGradient())) + outs0["GateBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if 
(GateWeight.defined() && (!out_metas[8].empty()) && + (!out_metas[8][0].IsStopGradient())) + outs0["GateWeight@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto NonbatchedBias = + egr::EagerUtils::RecoverTensorWrapper(&this->NonbatchedBias_); + if (NonbatchedBias.defined()) { + ins0["NonbatchedBias"] = egr::EagerUtils::TrySyncToVars(NonbatchedBias); + if ((!out_metas[6].empty()) && (!out_metas[6][0].IsStopGradient())) + outs0["NonbatchedBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto& attrs_map0 = this->attr_map_; + // Pass the entire attribute map to TraceOp + // The underlying kernel will pickup whatever attribute they need at runtime + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_gate_attention_grad", + ins0, + outs0, + attrs_map0, + egr::Controller::Instance().GetExpectedPlace(), + &this->default_attr_map_, + false, + {}); + + if (outs0.find("Query@GRAD") != outs0.end()) { + outputs[0] = egr::EagerUtils::GetOutputs(outs0["Query@GRAD"]); + } + if (outs0.find("OutLinearBias@GRAD") != outs0.end()) { + outputs[11] = egr::EagerUtils::GetOutputs(outs0["OutLinearBias@GRAD"]); + } + if (outs0.find("OutLinearWeight@GRAD") != outs0.end()) { + outputs[10] = egr::EagerUtils::GetOutputs(outs0["OutLinearWeight@GRAD"]); + } + + if (merge_qkv) { + if (outs0.find("QKVWeight@GRAD") != outs0.end()) { + outputs[5] = egr::EagerUtils::GetOutputs(outs0["QKVWeight@GRAD"]); + } + } else { + if (outs0.find("Key@GRAD") != outs0.end()) { + outputs[1] = egr::EagerUtils::GetOutputs(outs0["Key@GRAD"]); + } + if (outs0.find("QueryWeight@GRAD") != outs0.end()) { + outputs[2] = egr::EagerUtils::GetOutputs(outs0["QueryWeight@GRAD"]); + } + if (outs0.find("KeyWeight@GRAD") != outs0.end()) { + outputs[3] = egr::EagerUtils::GetOutputs(outs0["KeyWeight@GRAD"]); + } + if (outs0.find("ValueWeight@GRAD") != outs0.end()) { + outputs[4] = egr::EagerUtils::GetOutputs(outs0["ValueWeight@GRAD"]); + } + } + + if (has_gating) { + if (outs0.find("GateBias@GRAD") != outs0.end()) { + outputs[9] = egr::EagerUtils::GetOutputs(outs0["GateBias@GRAD"]); + } + if (outs0.find("GateWeight@GRAD") != outs0.end()) { + outputs[8] = egr::EagerUtils::GetOutputs(outs0["GateWeight@GRAD"]); + } + } + + if (NonbatchedBias.defined()) { + if (outs0.find("NonbatchedBias@GRAD") != outs0.end()) { + outputs[6] = egr::EagerUtils::GetOutputs(outs0["NonbatchedBias@GRAD"]); + } + } + + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs); + return outputs; +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h new file mode 100644 index 0000000000000..0f0fac4b725e0 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/imperative/tracer.h" + +class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase { + public: + fused_gate_attentionGradNodeCompat() : egr::GradNodeBase() { + VLOG(7) << " Construct fused_gate_attentionGradNodeCompat "; + } + fused_gate_attentionGradNodeCompat(size_t bwd_in_slot_num, + size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { + VLOG(7) << " Construct fused_gate_attentionGradNodeCompat "; + } + ~fused_gate_attentionGradNodeCompat() override { + VLOG(6) << " Destruct fused_gate_attentionGradNodeCompat "; + } + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; + + void ClearTensorWrappers() override { + FMHAOut_.clear(); + GateBias_.clear(); + GateOut_.clear(); + GateWeight_.clear(); + NonbatchedBias_.clear(); + OutLinearBias_.clear(); + OutLinearWeight_.clear(); + QKVTransposeOut_.clear(); + QKVWeight_.clear(); + Query_.clear(); + SoftmaxOut_.clear(); + Key_.clear(); + QueryWeight_.clear(); + KeyWeight_.clear(); + ValueWeight_.clear(); + QueryTransposeOut_.clear(); + KeyTransposeOut_.clear(); + ValueTransposeOut_.clear(); + + SetIsTensorWrappersCleared(true); + } + std::string name() override { return "fused_gate_attentionGradNodeCompat"; } + + std::shared_ptr Copy() const override { + { + auto copied_node = std::shared_ptr( + new fused_gate_attentionGradNodeCompat(*this)); + return copied_node; + } + } + + // SetX, SetY, ... + void SetTensorWrapperFMHAOut(const paddle::experimental::Tensor& FMHAOut) { + FMHAOut_ = egr::TensorWrapper(FMHAOut, false); + } + void SetTensorWrapperGateBias(const paddle::experimental::Tensor& GateBias) { + GateBias_ = egr::TensorWrapper(GateBias, false); + } + void SetTensorWrapperGateOut(const paddle::experimental::Tensor& GateOut) { + GateOut_ = egr::TensorWrapper(GateOut, false); + } + void SetTensorWrapperGateWeight( + const paddle::experimental::Tensor& GateWeight) { + GateWeight_ = egr::TensorWrapper(GateWeight, false); + } + void SetTensorWrapperNonbatchedBias( + const paddle::experimental::Tensor& NonbatchedBias) { + NonbatchedBias_ = egr::TensorWrapper(NonbatchedBias, false); + } + void SetTensorWrapperOutLinearBias( + const paddle::experimental::Tensor& OutLinearBias) { + OutLinearBias_ = egr::TensorWrapper(OutLinearBias, false); + } + void SetTensorWrapperOutLinearWeight( + const paddle::experimental::Tensor& OutLinearWeight) { + OutLinearWeight_ = egr::TensorWrapper(OutLinearWeight, false); + } + void SetTensorWrapperQKVTransposeOut( + const paddle::experimental::Tensor& QKVTransposeOut) { + QKVTransposeOut_ = egr::TensorWrapper(QKVTransposeOut, false); + } + void SetTensorWrapperQKVWeight( + const paddle::experimental::Tensor& QKVWeight) { + QKVWeight_ = egr::TensorWrapper(QKVWeight, false); + } + void SetTensorWrapperQuery(const paddle::experimental::Tensor& Query) { + Query_ = egr::TensorWrapper(Query, false); + } + void SetTensorWrapperSoftmaxOut( + const paddle::experimental::Tensor& SoftmaxOut) { + SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false); + } + void SetTensorWrapperKey(const paddle::experimental::Tensor& Key) { + Key_ = egr::TensorWrapper(Key, false); + } + void SetTensorWrapperQueryWeight( + const paddle::experimental::Tensor& QueryWeight) { + QueryWeight_ = 
egr::TensorWrapper(QueryWeight, false); + } + void SetTensorWrapperKeyWeight( + const paddle::experimental::Tensor& KeyWeight) { + KeyWeight_ = egr::TensorWrapper(KeyWeight, false); + } + void SetTensorWrapperValueWeight( + const paddle::experimental::Tensor& ValueWeight) { + ValueWeight_ = egr::TensorWrapper(ValueWeight, false); + } + void SetTensorWrapperQueryTransposeOut( + const paddle::experimental::Tensor& QueryTransposeOut) { + QueryTransposeOut_ = egr::TensorWrapper(QueryTransposeOut, false); + } + void SetTensorWrapperKeyTransposeOut( + const paddle::experimental::Tensor& KeyTransposeOut) { + KeyTransposeOut_ = egr::TensorWrapper(KeyTransposeOut, false); + } + void SetTensorWrapperValueTransposeOut( + const paddle::experimental::Tensor& ValueTransposeOut) { + ValueTransposeOut_ = egr::TensorWrapper(ValueTransposeOut, false); + } + + // SetAttrMap + void SetAttrMap(paddle::framework::AttributeMap&& attr_map) { + attr_map_ = std::move(attr_map); + } + void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) { + default_attr_map_ = std::move(default_attr_map); + } + + private: + // TensorWrappers + egr::TensorWrapper FMHAOut_; + egr::TensorWrapper GateBias_; + egr::TensorWrapper GateOut_; + egr::TensorWrapper GateWeight_; + egr::TensorWrapper NonbatchedBias_; + egr::TensorWrapper OutLinearBias_; + egr::TensorWrapper OutLinearWeight_; + egr::TensorWrapper QKVTransposeOut_; + egr::TensorWrapper QKVWeight_; + egr::TensorWrapper Query_; + egr::TensorWrapper SoftmaxOut_; + + egr::TensorWrapper Key_; + egr::TensorWrapper QueryWeight_; + egr::TensorWrapper KeyWeight_; + egr::TensorWrapper ValueWeight_; + egr::TensorWrapper QueryTransposeOut_; + egr::TensorWrapper KeyTransposeOut_; + egr::TensorWrapper ValueTransposeOut_; + + // Attribute Map + paddle::framework::AttributeMap attr_map_; + paddle::framework::AttributeMap default_attr_map_; +}; diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 6910f9e537fc8..bbd6ea6494638 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -51,7 +51,8 @@ static std::unordered_set ops_to_fill_zero_for_empty_grads = { "split", "rnn"}; /* --- Black Ops list that's NO NEED to apply code generation --- */ -static std::unordered_set black_ops_list = {"run_program"}; +static std::unordered_set black_ops_list = { + "run_program", "fused_gate_attention"}; static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; @@ -2972,7 +2973,10 @@ static std::string GenerateDygraphHFileIncludes() { "#include \"paddle/phi/api/all.h\"\n" "#include \"paddle/fluid/eager/utils.h\"\n" "#include \"paddle/fluid/imperative/tracer.h\"\n" - "#include \"paddle/fluid/framework/op_registry.h\"\n\n"; + "#include \"paddle/fluid/framework/op_registry.h\"\n" + "#include " + "\"paddle/fluid/eager/api/manual/fluid_manual/" + "dygraph_forward_api.h\"\n\n"; dygraph_forward_api_includes_str += "extern std::unordered_map> " @@ -3021,7 +3025,10 @@ static void GenerateNodeHFile(const std::string& node_h_path, "#pragma once\n" "#include \"paddle/fluid/eager/tensor_wrapper.h\"\n" "#include \"paddle/fluid/imperative/tracer.h\"\n" - "#include \"paddle/fluid/eager/grad_node_info.h\"\n\n"; + "#include \"paddle/fluid/eager/grad_node_info.h\"\n" + "#include " + "\"paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h\"\n\n"; + std::ofstream node_h_stream(node_h_path, std::ios::out); 
node_h_stream << node_h_include_str; node_h_stream << grad_node_str; diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index fdb8529515d30..a7cd1dc8c4673 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -103,13 +103,13 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir): with open(nodes_level_cmakelist_path, "w") as f: f.write( - "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps})\n" + "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps} ${fluid_manual_nodes})\n" ) f.write("add_dependencies(dygraph_node eager_codegen)") with open(forwards_level_cmakelist_path, "w") as f: f.write( - "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n" + "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${fluid_manual_functions})\n" ) f.write("add_dependencies(dygraph_function eager_codegen)") diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 0d219a4f76d16..7400246f40725 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -363,13 +363,14 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { dev_ctx, query, key, query_weight, qkv_weight, merge_qkv, has_gating); if (merge_qkv) { - PADDLE_ENFORCE_EQ(!key || query == key, - true, - platform::errors::InvalidArgument( - "key is expected to be nullptr or the same as " - "query, but recieved key=%p, query=%p.", - key, - query)); + PADDLE_ENFORCE_EQ( + !key || query == key || query->data() == key->data(), + true, + platform::errors::InvalidArgument( + "key is expected to be nullptr or the same as " + "query, but recieved key=%p, query=%p.", + key, + query)); // 1. 
Merged QKV Matmul: einsum(nbhqk,nbkhc -> nbqhc) Tensor *qkv_out = config.GetQKVOut(); From 56ddd7c2dc62d53f3a9c943a66ae8262e920b245 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Thu, 30 Jun 2022 20:36:59 +0800 Subject: [PATCH 014/250] remove decrease_axis in op_teller.cc , support them in slice (#43963) --- .../tensorrt/convert/batch_norm_op.cc | 2 +- .../tensorrt/convert/elementwise_op.cc | 16 +++ .../inference/tensorrt/convert/slice_op.cc | 23 ++++ paddle/fluid/inference/tensorrt/op_teller.cc | 11 +- .../inference/test_trt_convert_elementwise.py | 103 ++++++++++++++++++ .../ir/inference/test_trt_convert_slice.py | 7 -- 6 files changed, 146 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 376dfa32e879f..c5dae16bc3cac 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -169,7 +169,7 @@ class BatchNormOpConverter : public OpConverter { engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); if (x_dim.nbDims < 3 + dynamic_shape_offset) { - layer->getOutput(0)->setName("batch_norm_out"); + layer->getOutput(0)->setName(("BN: ScaleNd: " + output_name).c_str()); layer->setName(("BN: ScaleNd: (Output: " + output_name + ")").c_str()); nvinfer1::Dims squeeze_shape; squeeze_shape.nbDims = x_dim.nbDims; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 2d342a6f7040d..aff23343a078f 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -44,6 +44,22 @@ class ElementwiseTensorOpConverter : public OpConverter { for (int i = 0; i < trt_dims_y.nbDims; i++) { trt_dims_y.d[i] = dims_y[i]; } + // this is the special case when dims_y includes batch dimension! + // we need remove batch dimension! + if (!engine_->with_dynamic_shape() && + trt_dims_y.nbDims == (X->getDimensions().nbDims + 1)) { + trt_dims_y.nbDims--; + PADDLE_ENFORCE_EQ(trt_dims_y.d[0], + 1, + platform::errors::InvalidArgument( + "Elementwise type(%s) op's Y is a weight " + "including batch dimension. 
Please " + "check if the 0th dimension equals 1.", + op_type_)); + for (int i = 0; i < trt_dims_y.nbDims; i++) { + trt_dims_y.d[i] = trt_dims_y.d[i + 1]; + } + } Y = TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims_y, y_weight.get()) ->getOutput(0); } else { diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index bcf5e638126e2..4f85e4f07cc4e 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -166,6 +166,29 @@ class SliceOpConverter : public OpConverter { } layer = TRT_ENGINE_ADD_LAYER( engine_, Slice, *input, trt_start_dims, trt_size_dims, trt_step_dims); + nvinfer1::Dims real_trt_size_dims; + real_trt_size_dims.nbDims = 0; + + if (decrease_axises.size() > 0) { + for (size_t i = 0; i < decrease_axises.size(); i++) { + decrease_axises[i]--; + } + for (int i = 0; i < trt_size_dims.nbDims; i++) { + if (decrease_axises.end() != + std::find(decrease_axises.begin(), decrease_axises.end(), i)) + continue; + real_trt_size_dims.d[real_trt_size_dims.nbDims] = trt_size_dims.d[i]; + real_trt_size_dims.nbDims++; + } + if (real_trt_size_dims.nbDims == 0) { + real_trt_size_dims.nbDims = 1; + real_trt_size_dims.d[0] = 1; + } + auto reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0)); + reshape_layer->setReshapeDimensions(real_trt_size_dims); + layer = static_cast(reshape_layer); + } #else bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 1ee748afe507c..190f6c731a3b4 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -1217,14 +1217,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, if (desc.HasAttr("decrease_axis")) { std::vector decrease_axis = BOOST_GET_CONST(std::vector, desc.GetAttr("decrease_axis")); - if (with_dynamic_shape) { - if (decrease_axis.size() > 1) { - return false; - } - } else { - if (decrease_axis.size() > 0) { - VLOG(3) << "Invalid slice decrease_axis. 
decrease_axis.size() > 0" - "is not supported in TensorRT"; + if (!with_dynamic_shape) { + if (decrease_axis.end() != + std::find(decrease_axis.begin(), decrease_axis.end(), 0)) { return false; } } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 2fabc6013893e..db011c5bd54f6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -21,6 +21,109 @@ from typing import Optional, List, Callable, Dict, Any, Set +# This is the special test case with weight including batch dimension +# I don't want to mess up the code written by others, so I wrote a class specifically +class TrtConvertElementwiseTest_one_input_special_case0(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + + def generate_input(shape): + return np.random.random(shape).astype(np.float32) + + def generate_weight(): + return np.random.randn(1, 32, 1, 1).astype(np.float32) + + for batch in [1, 4]: + for shape in [[batch, 32, 16, 32]]: + for op_type in ["elementwise_add", "elementwise_mul"]: + for axis in [-1]: + self.dims = len(shape) + dics = [{"axis": axis}] + ops_config = [{ + "op_type": op_type, + "op_inputs": { + "X": ["input_data"], + "Y": ["weight"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "weight": + TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input, shape)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + # The input.dims[1] must be equal to the weight's length. 
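The constant-Y case this test targets can be checked with plain broadcasting outside TensorRT; a minimal Python sketch using the shapes from generate_input/generate_weight above:

    import numpy as np

    x = np.random.random((4, 32, 16, 32)).astype(np.float32)  # input_data
    w = np.random.randn(1, 32, 1, 1).astype(np.float32)       # weight carrying a batch dim of 1
    # In static-shape mode the converter strips that leading 1, so the
    # weight effectively becomes [32, 1, 1] before the elementwise op.
    out_add = x + w
    out_mul = x * w
    assert out_add.shape == x.shape and out_mul.shape == x.shape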
+ if self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 32, 4, 4] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 32, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 32, 16, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py index 76a84c77122c5..deac7ef9d2a14 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py @@ -111,13 +111,6 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - inputs = program_config.inputs - if dynamic_shape == True and len(attrs[0]["decrease_axis"]) == 0: - return 1, 2 - if dynamic_shape == True and len(attrs[0]["decrease_axis"]) != 1: - return 0, 3 - if dynamic_shape == False and len(attrs[0]["decrease_axis"]) != 0: - return 0, 3 if not dynamic_shape: for x in attrs[0]["axes"]: if x == 0: From f33763e30b0f0cd9aa3ea5fb59e1e292a1cde2e4 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 30 Jun 2022 21:29:30 +0800 Subject: [PATCH 015/250] Move apis(digamma, dist, dot) from legacy_api.yaml to api.yaml (#43956) * move standard apis to api.yaml * revert erfinv * delete dot_op.h * fix dot * rerun ci --- paddle/fluid/operators/digamma_op.cc | 98 ------------ paddle/fluid/operators/digamma_op.h | 18 --- paddle/fluid/operators/dist_op.cc | 140 ------------------ paddle/fluid/operators/dot_op.cc | 139 ----------------- paddle/fluid/operators/dot_op.cu | 36 ----- paddle/fluid/operators/dot_op.h | 83 ----------- paddle/fluid/operators/matmul_v2_op.h | 1 - paddle/phi/kernels/digamma_kernel.h | 7 + paddle/phi/kernels/dist_kernel.h | 31 ++++ paddle/phi/kernels/erfinv_kernel.h | 12 ++ paddle/phi/ops/compat/digamma_sig.cc | 26 ---- paddle/phi/ops/compat/dist_sig.cc | 26 ---- paddle/phi/ops/compat/dot_sig.cc | 26 ---- .../fluid/tests/unittests/test_dot_op.py | 36 +++-- python/paddle/tensor/linalg.py | 9 +- python/paddle/utils/code_gen/api.yaml | 28 ++++ python/paddle/utils/code_gen/backward.yaml | 31 ++++ python/paddle/utils/code_gen/legacy_api.yaml | 30 +--- 
.../utils/code_gen/legacy_backward.yaml | 20 --- 19 files changed, 141 insertions(+), 656 deletions(-) delete mode 100644 paddle/fluid/operators/digamma_op.cc delete mode 100644 paddle/fluid/operators/digamma_op.h delete mode 100644 paddle/fluid/operators/dist_op.cc delete mode 100644 paddle/fluid/operators/dot_op.cc delete mode 100644 paddle/fluid/operators/dot_op.cu delete mode 100644 paddle/fluid/operators/dot_op.h delete mode 100644 paddle/phi/ops/compat/digamma_sig.cc delete mode 100644 paddle/phi/ops/compat/dist_sig.cc delete mode 100644 paddle/phi/ops/compat/dot_sig.cc diff --git a/paddle/fluid/operators/digamma_op.cc b/paddle/fluid/operators/digamma_op.cc deleted file mode 100644 index 5f17c3b3da658..0000000000000 --- a/paddle/fluid/operators/digamma_op.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/digamma_op.h" - -namespace paddle { -namespace operators { - -class DigammaOp : public framework::OperatorWithKernel { - public: - DigammaOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Digamma"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Digamma"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", "Out"); - } -}; - -class DigammaOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of digamma operator."); - AddOutput("Out", "(Tensor), The output tensor of digamma operator."); - AddComment(R"DOC( -Digamma Operator. - -This operator is used to perform elementwise digamma for input $X$. 
-$$out = \Psi(x) = \frac{ \Gamma^{'}(x) }{ \Gamma(x) }$$ - -)DOC"); - } -}; - -class DigammaGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@Grad", - "DigammaGrad"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DigammaGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - "X@Grad", - "DigammaGrad"); - - auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); - ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); - ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X")); - } -}; - -template -class DigammaGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr retv) const override { - retv->SetType("digamma_grad"); - retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - retv->SetInput("X", this->Input("X")); - retv->SetAttrMap(this->Attrs()); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(digamma, - ops::DigammaOp, - ops::DigammaOpMaker, - ops::DigammaGradOpMaker, - ops::DigammaGradOpMaker); -REGISTER_OPERATOR(digamma_grad, ops::DigammaGradOp); diff --git a/paddle/fluid/operators/digamma_op.h b/paddle/fluid/operators/digamma_op.h deleted file mode 100644 index 85f9094e6a0bc..0000000000000 --- a/paddle/fluid/operators/digamma_op.h +++ /dev/null @@ -1,18 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc deleted file mode 100644 index 49f8fa75aa6ce..0000000000000 --- a/paddle/fluid/operators/dist_op.cc +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
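Although the fluid digamma_op files are deleted here, the operator stays exposed as paddle.digamma; a minimal usage sketch (values approximate):

    import paddle

    x = paddle.to_tensor([1.0, 2.0], dtype='float32')
    y = paddle.digamma(x)
    # digamma(1) = -0.5772... (minus the Euler-Mascheroni constant)
    # digamma(2) =  0.4227... (= 1 - 0.5772...)
    print(y)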
- -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class DistOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Dist"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Dist"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Dist"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_NE(phi::product(x_dims), - 0, - platform::errors::InvalidArgument( - "The Input(X) has not been initialized properly. The " - "shape of Input(X) = [%s].", - x_dims)); - PADDLE_ENFORCE_NE(phi::product(y_dims), - 0, - platform::errors::InvalidArgument( - "The Input(Y) has not been initialized properly. The " - "shape of Input(Y) = [%s].", - y_dims)); - ctx->SetOutputDim("Out", {1}); - } -}; - -class DistOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input Tensor of Dist Op."); - AddInput("Y", "The Right-hand-side input Tensor of Dist Op."); - AddOutput("Out", - "The output of Dist Op, " - "which is the p-norm of (X - Y)"); - AddAttr("p", "the norm to be computed.").SetDefault(2.0f); - AddComment(R"DOC( -Dist Operator. -Given two tensors X and Y, compute Lp-norm of (X-Y). It is not a norm in a strict sense, -only as a measure of distance. The shapes of X and Y must be broadcastable. Where, Z = X - Y, - -When p = 0, defining $0^0 = 0$, the zero-norm of Z is simply the number of non-zero elements of z. -$$ -||Z||_{0} = \lim_{p \rightarrow 0} \sum_{i=1}^{m} |z_i|^p -$$ - -When p = inf, the inf-norm of Z is the maximum element of Z. -$$ -||Z||_\infty=\max_i |z_i| -$$ - -When p = -inf, the negative-inf-norm of Z is the minimum element of Z. 
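The norms listed above are easy to check numerically through the public API; a small sketch with paddle.dist:

    import paddle

    x = paddle.to_tensor([[3.0, 3.0], [3.0, 3.0]])
    y = paddle.to_tensor([[3.0, 3.0], [3.0, 1.0]])
    print(paddle.dist(x, y, p=0))             # 1.0 -> number of non-zero entries of x - y
    print(paddle.dist(x, y, p=2))             # 2.0 -> sqrt(sum((x - y)^2))
    print(paddle.dist(x, y, p=float('inf')))  # 2.0 -> max |x - y|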
-$$ -||Z||_{-\infty}=\min_i |z_i| -$$ - -Otherwise, the p-norm of Z follows the formula, -$$ -||Z||_{p} = (\sum_{i=i}^{m} |z_i|^p)^{1/p} -$$ - )DOC"); - } -}; - -class DistOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - } - if (ctx->HasOutput(framework::GradVarName("Y"))) { - ctx->SetOutputDim(framework::GradVarName("Y"), y_dims); - } - } -}; - -template -class DistGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType(this->ForwardOpType() + "_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput("Out", this->Output("Out")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(dist, - DistInferShapeFunctor, - PD_INFER_META(phi::DistInferMeta)); - -REGISTER_OPERATOR(dist, - ops::DistOp, - ops::DistOpMaker, - ops::DistGradOpMaker, - ops::DistGradOpMaker, - DistInferShapeFunctor); -REGISTER_OPERATOR(dist_grad, ops::DistOpGrad); diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc deleted file mode 100644 index 880186b84c3a1..0000000000000 --- a/paddle/fluid/operators/dot_op.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/dot_op.h" - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class DotOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); - } -}; - -class DotOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInput("X", "(Tensor) The first input tensor. "); - AddInput("Y", "(Tensor) The second input tensor. 
"); - AddOutput("Out", "(Tensor) The result tensor."); - AddComment(""); - } -}; - -class DotGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - true, - ctx->HasInput("X"), - platform::errors::PreconditionNotMet("Input(X) should not be null.")); - PADDLE_ENFORCE_EQ( - true, - ctx->HasInput("Y"), - platform::errors::PreconditionNotMet("Input(Y) should not be null.")); - PADDLE_ENFORCE_EQ(true, - ctx->HasInput(framework::GradVarName("Out")), - platform::errors::PreconditionNotMet( - "Input(Out@GRAD) should not be null.")); - - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(x_grad_name)) { - ctx->ShareDim("X", /*->*/ x_grad_name); - ctx->ShareLoD("X", /*->*/ x_grad_name); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->ShareDim("Y", /*->*/ y_grad_name); - ctx->ShareLoD("Y", /*->*/ y_grad_name); - } - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class DotOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("dot_grad"); - - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetAttrMap(this->Attrs()); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(dot, - DotInferShapeFunctor, - PD_INFER_META(phi::DotInferMeta)); - -REGISTER_OPERATOR(dot, - ops::DotOp, - ops::DotOpMaker, - ops::DotOpGradMaker, - ops::DotOpGradMaker, - DotInferShapeFunctor); - -REGISTER_OPERATOR(dot_grad, ops::DotGradOp); - -REGISTER_OP_CPU_KERNEL( - dot, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel>, - ops::DotKernel>); -REGISTER_OP_CPU_KERNEL( - dot_grad, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel>, - ops::DotGradKernel>); diff --git a/paddle/fluid/operators/dot_op.cu b/paddle/fluid/operators/dot_op.cu deleted file mode 100644 index 362a6a80f96fe..0000000000000 --- a/paddle/fluid/operators/dot_op.cu +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/dot_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - dot, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel, - ops::DotKernel>, - ops::DotKernel>); -REGISTER_OP_CUDA_KERNEL(dot_grad, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel>, - ops::DotGradKernel>); diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h deleted file mode 100644 index 0f4c80c4c9e07..0000000000000 --- a/paddle/fluid/operators/dot_op.h +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -// only can include the headers in paddle/phi/api dirs -#include "paddle/phi/api/lib/utils/tensor_utils.h" -#include "paddle/phi/kernels/dot_grad_kernel.h" -#include "paddle/phi/kernels/dot_kernel.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// See Note [ Why still keep the original kernel implementation? ] -template -class DotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - auto& dev_ctx = ctx.device_context(); - out->mutable_data(x->place()); - - // call new kernel - phi::DotKernel< - T, - typename paddle::framework::ConvertToPhiContext::TYPE>( - static_cast::TYPE&>(dev_ctx), - *x, - *y, - out); - } -}; - -template -class DotGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* tensor_x = ctx.Input("X"); - auto* tensor_y = ctx.Input("Y"); - auto* tensor_dout = ctx.Input(framework::GradVarName("Out")); - auto* tensor_dx = ctx.Output(framework::GradVarName("X")); - auto* tensor_dy = ctx.Output(framework::GradVarName("Y")); - - if (tensor_dx) tensor_dx->mutable_data(ctx.GetPlace()); - if (tensor_dy) tensor_dy->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - - // call new kernel - phi::DotGradKernel( - static_cast::TYPE&>(dev_ctx), - *tensor_x, - *tensor_y, - *tensor_dout, - tensor_dx, - tensor_dy); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 36267b9f9a391..8e436dd6afbfb 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -21,7 +21,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dot_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/complex_functors.h" diff --git a/paddle/phi/kernels/digamma_kernel.h b/paddle/phi/kernels/digamma_kernel.h index 3cf1eae67cc3e..b45b7070d2dee 100644 --- a/paddle/phi/kernels/digamma_kernel.h +++ b/paddle/phi/kernels/digamma_kernel.h @@ -18,6 +18,13 @@ namespace phi { +/** + * @brief This kernrel is used to perform elementwise digamma for x. + * $$out = \Psi(x) = \frac{ \Gamma^{'}(x) }{ \Gamma(x) }$$ + * @param ctx device context + * @param x the input tensor of digamma + * @param out the output tensor of digamma + */ template void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); diff --git a/paddle/phi/kernels/dist_kernel.h b/paddle/phi/kernels/dist_kernel.h index 6cb3d6e0e8bef..8c1f6674aa5b5 100644 --- a/paddle/phi/kernels/dist_kernel.h +++ b/paddle/phi/kernels/dist_kernel.h @@ -18,6 +18,37 @@ namespace phi { +/** + * @brief Given two tensors x and y, compute Lp-norm of (x-y). + * It is not a norm in a strict sense, only as a measure of distance. + * The shapes of x and y must be broadcastable. Where, z = x - y, + * + * When p = 0, defining $0^0 = 0$, the zero-norm of z is simply + * the number of non-zero elements of z. + * $$ + * ||z||_{0} = \lim_{p \rightarrow 0} \sum_{i=1}^{m} |z_i|^p + * $$ + * + * When p = inf, the inf-norm of z is the maximum element of z. + * $$ + * ||z||_\infty=\max_i |z_i| + * $$ + * + * When p = -inf, the negative-inf-norm of z is the minimum element of z. + * $$ + * ||z||_{-\infty}=\min_i |z_i| + * $$ + * + * Otherwise, the p-norm of z follows the formula, + * $$ + * ||z||_{p} = (\sum_{i=i}^{m} |z_i|^p)^{1/p} + * $$ + * @param ctx device context + * @param x the input Tensor of Dist + * @param y the Right-hand-side input Tensor of Dist + * @param p the norm to be computed + * @param out the output of Dist, which is the p-norm of (x - y) + */ template void DistKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/kernels/erfinv_kernel.h b/paddle/phi/kernels/erfinv_kernel.h index 8380a62971ba4..3ddb1ecbdfd80 100644 --- a/paddle/phi/kernels/erfinv_kernel.h +++ b/paddle/phi/kernels/erfinv_kernel.h @@ -18,6 +18,18 @@ namespace phi { +/** + * @brief This kernel is used to compute inverse error function of x. + * + * The equation is: + * $$erfinv(x) = {ndtri({x \over 2} + 0.5)} \over {\sqrt{2}}$$ + * + * The input `x` can carry the LoD (Level of Details) information, + * or not. And the output shares the LoD information with `x` + * @param ctx device context + * @param x the input tensor of erfinv + * @param out the output tensor of erfinv + */ template void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); diff --git a/paddle/phi/ops/compat/digamma_sig.cc b/paddle/phi/ops/compat/digamma_sig.cc deleted file mode 100644 index 6c14dd9bf1744..0000000000000 --- a/paddle/phi/ops/compat/digamma_sig.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature DigammaGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("digamma_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(digamma_grad, phi::DigammaGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/dist_sig.cc b/paddle/phi/ops/compat/dist_sig.cc deleted file mode 100644 index cc702fefbc940..0000000000000 --- a/paddle/phi/ops/compat/dist_sig.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature DistGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "dist_grad", {"X", "Y", "Out", "Out@GRAD"}, {"p"}, {"X@GRAD", "Y@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(dist_grad, phi::DistGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/dot_sig.cc b/paddle/phi/ops/compat/dot_sig.cc deleted file mode 100644 index 2187a7eb4fca0..0000000000000 --- a/paddle/phi/ops/compat/dot_sig.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
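The dot operator whose compat mapping is removed here is still exposed as paddle.dot; a minimal sketch (1-D inputs give a single value, 2-D inputs are reduced row-wise):

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 3.0])
    y = paddle.to_tensor([4.0, 5.0, 6.0])
    print(paddle.dot(x, y))    # 32. = 1*4 + 2*5 + 3*6

    xb = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0]])
    yb = paddle.to_tensor([[5.0, 6.0], [7.0, 8.0]])
    print(paddle.dot(xb, yb))  # [17., 53.] -> per-row dot products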
*/ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature DotGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "dot_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(dot_grad, phi::DotGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_dot_op.py b/python/paddle/fluid/tests/unittests/test_dot_op.py index 536f8fd8d8af7..ffdc90dd986ad 100644 --- a/python/paddle/fluid/tests/unittests/test_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_dot_op.py @@ -27,6 +27,7 @@ class DotOp(OpTest): def setUp(self): self.op_type = "dot" + self.python_api = paddle.dot self.init_dtype() self.init_input_output() @@ -38,34 +39,43 @@ def setUp(self): self.attrs = {} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad_normal(self): if core.is_compiled_with_rocm(): self.check_grad( ['X', 'Y'], 'Out', - user_defined_grads=[self.inputs['Y'], self.inputs['X']]) + user_defined_grads=[self.inputs['Y'], self.inputs['X']], + check_eager=True) else: - self.check_grad(['X', 'Y'], 'Out') + self.check_grad(['X', 'Y'], 'Out', check_eager=True) def test_check_grad_ingore_x(self): if core.is_compiled_with_rocm(): self.check_grad(['Y'], 'Out', no_grad_set=set("X"), - user_defined_grads=[self.inputs['X']]) + user_defined_grads=[self.inputs['X']], + check_eager=True) else: - self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + self.check_grad(['Y'], + 'Out', + no_grad_set=set("X"), + check_eager=True) def test_check_grad_ingore_y(self): if core.is_compiled_with_rocm(): self.check_grad(['X'], 'Out', no_grad_set=set('Y'), - user_defined_grads=[self.inputs['Y']]) + user_defined_grads=[self.inputs['Y']], + check_eager=True) else: - self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + self.check_grad(['X'], + 'Out', + no_grad_set=set('Y'), + check_eager=True) def init_input_output(self): self.x = np.random.uniform(0.1, 1, [121]).astype(self.dtype) @@ -137,6 +147,7 @@ class TestComplexDotOp(OpTest): def setUp(self): self.op_type = "dot" + self.python_api = paddle.dot self.init_base_dtype() self.init_input_output() self.init_grad_input_output() @@ -164,27 +175,30 @@ def init_grad_input_output(self): self.grad_y = self.grad_out * np.conj(self.x) def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out', user_defined_grads=[self.grad_x, self.grad_y], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) def test_check_grad_ingore_x(self): self.check_grad(['Y'], 'Out', no_grad_set=set("X"), user_defined_grads=[self.grad_y], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) def test_check_grad_ingore_y(self): self.check_grad(['X'], 'Out', no_grad_set=set('Y'), user_defined_grads=[self.grad_x], - user_defined_grad_outputs=[self.grad_out]) + user_defined_grad_outputs=[self.grad_out], + check_eager=True) class TestComplexDotOp2D(OpTest): diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index c704a1b52d14e..95eaee2cc0356 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -1017,11 +1017,12 @@ def dot(x, y, name=None): print(z) """ + if in_dygraph_mode(): + return _C_ops.final_state_dot(x, y) + if _in_legacy_dygraph(): + return _C_ops.dot(x, y) + op_type = 
'dot' - # skip var type check in dygraph mode to improve efficiency - if paddle.in_dynamic_mode(): - op = getattr(_C_ops, op_type) - return op(x, y) assert x is not None, 'x cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type) diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 500ea7b7adc6d..0f86c93d9314e 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -52,6 +52,34 @@ func : diagonal backward : diagonal_grad +- api : digamma + args : (Tensor x) + output : Tensor + infer_meta : + func : UnchangedInferMeta + kernel : + func : digamma + backward : digamma_grad + +- api : dist + args : (Tensor x, Tensor y, float p = 2.0) + output : Tensor + infer_meta : + func : DistInferMeta + kernel : + func : dist + backward : dist_grad + +- api : dot + args : (Tensor x, Tensor y) + output : Tensor + infer_meta : + func : DotInferMeta + kernel : + func : dot + data_type : x + backward : dot_grad + - api : erf args : (Tensor x) output : Tensor diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 32c6e2c4b63ef..32906ce382742 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -51,6 +51,37 @@ data_type : out_grad no_need_buffer : x +- backward_api : digamma_grad + forward : digamma (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : digamma_grad + +- backward_api : dist_grad + forward : dist (Tensor x, Tensor y, float p) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, float p) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : dist_grad + +- backward_api : dot_grad + forward : dot (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : dot_grad + data_type : out_grad + - backward_api : erf_grad forward : erf (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/python/paddle/utils/code_gen/legacy_api.yaml b/python/paddle/utils/code_gen/legacy_api.yaml index 8d20833c652cc..c307fc7a19d5d 100644 --- a/python/paddle/utils/code_gen/legacy_api.yaml +++ b/python/paddle/utils/code_gen/legacy_api.yaml @@ -497,24 +497,6 @@ kernel : func : diag -- api : digamma - args : (Tensor x) - output : Tensor - infer_meta : - func : UnchangedInferMeta - kernel : - func : digamma - backward : digamma_grad - -- api : dist - args : (Tensor x, Tensor y, float p) - output : Tensor - infer_meta : - func : DistInferMeta - kernel : - func : dist - backward : dist_grad - - api : divide args : (Tensor x, Tensor y) output : Tensor @@ -524,14 +506,6 @@ func : divide backward : divide_grad -- api : dot - args : (Tensor x, Tensor y) - output : Tensor - infer_meta : - func : DotInferMeta - kernel : - func : dot - - api : dropout args : (Tensor x, Tensor seed_tensor, float p, bool is_test, str mode, int seed, bool fix_seed) output : Tensor(out), Tensor(mask) @@ -629,14 +603,14 @@ kernel : func : equal_all -# erfinv - api : erfinv args : (Tensor x) - output : Tensor + output : Tensor(out) infer_meta : func : UnchangedInferMeta kernel : func : erfinv + inplace : (x -> out) backward : erfinv_grad # exp diff --git 
a/python/paddle/utils/code_gen/legacy_backward.yaml b/python/paddle/utils/code_gen/legacy_backward.yaml index 16d58fde77ffe..a4589120cc475 100644 --- a/python/paddle/utils/code_gen/legacy_backward.yaml +++ b/python/paddle/utils/code_gen/legacy_backward.yaml @@ -498,26 +498,6 @@ kernel : func : determinant_grad -- backward_api : digamma_grad - forward : digamma (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : digamma_grad - -- backward_api : dist_grad - forward : dist (Tensor x, Tensor y, float p) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, float p) - output : Tensor(x_grad), Tensor(y_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [x, y] - kernel : - func : dist_grad - - backward_api : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) From 7499f9613114b903866ef105489b79a41bf1b837 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Fri, 1 Jul 2022 10:09:27 +0800 Subject: [PATCH 016/250] Switch eager mode to default dygraph mode (#43767) --- python/paddle/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3d88a1377a056..df4691d49e891 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -75,7 +75,7 @@ CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() _dygraph_tracer_ = None -_in_eager_mode_ = (os.environ.get('FLAGS_enable_eager_mode') == '1') +_in_eager_mode_ = (os.environ.get('FLAGS_enable_eager_mode', '1') == '1') _global_expected_place_ = None _current_device = None global_prog_seed = 0 From 9f397f16a999a054a4b349057ef05d176fe603e2 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Fri, 1 Jul 2022 10:34:16 +0800 Subject: [PATCH 017/250] filter whitespace/parent since clang-format has similar function (#43978) --- tools/codestyle/cpplint_pre_commit.hook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index cef11ab1351b7..3c584a440ee1f 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -24,7 +24,7 @@ for file in $files; do if [[ $file =~ ^(patches/.*) ]]; then continue; else - cpplint --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11 $file; + cpplint --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); fi done From 3a59ede964922aa009e4bbbf7ebee36798e4a72c Mon Sep 17 00:00:00 2001 From: Chenxiao Niu Date: Fri, 1 Jul 2022 10:49:50 +0800 Subject: [PATCH 018/250] [MLU] add rnn backward kernel. 
(#43969) --- paddle/fluid/operators/mlu/mlu_baseop.cc | 82 ++++ paddle/fluid/operators/mlu/mlu_baseop.h | 24 ++ paddle/fluid/operators/rnn_op_mlu.cc | 397 +++++++++++++++++- .../tests/unittests/mlu/test_rnn_op_mlu.py | 55 +-- 4 files changed, 522 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 972bdefdf02b8..5531250f363b5 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -4616,6 +4616,88 @@ MLURNNDesc::~MLURNNDesc() { reservespace_size)); } +/* static */ void MLUCnnl::RNNBackward(const ExecutionContext& ctx, + const cnnlRNNDescriptor_t rnn_desc, + cnnlWgradMode_t add_grad, + const int dev_seq_lengths[], + const void* weight_param_ptr, + void* dweight_param_ptr, + size_t weightspace_size, + const cnnlSeqDataDescriptor_t x_desc, + const void* x, + void* dx, + const cnnlSeqDataDescriptor_t y_desc, + const void* y, + const void* dy, + const cnnlTensorDescriptor_t hx_desc, + const void* hx, + const void* dhy, + void* dhx, + const cnnlTensorDescriptor_t cx_desc, + const void* cx, + const void* dcy, + void* dcx, + void* reservespace_ptr, + size_t reservespace_size) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_NOT_NULL( + rnn_desc, + paddle::platform::errors::Fatal( + "MLU RNNForward failed. rnn_desc initializing failed.")); + PADDLE_ENFORCE_NOT_NULL( + x_desc, + paddle::platform::errors::Fatal( + "MLU RNNForward failed. x_desc initializing failed.")); + auto& dev_ctx = GetDevCtxFromCTX(ctx); + size_t workspace_size; + Tensor workspace; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNTempSizes( + handle, rnn_desc, x_desc, &workspace_size, &reservespace_size)); + workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlRNNBackwardData(handle, + rnn_desc, + dev_seq_lengths, + y_desc, + y, + dy, + x_desc, + dx, + hx_desc, + hx, + dhy, + dhx, + cx_desc, + cx, + dcy, + dcx, + weight_param_ptr, + weightspace_size, + workspace_ptr, + workspace_size, + reservespace_ptr, + reservespace_size)); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlRNNBackwardWeights(handle, + rnn_desc, + add_grad, + dev_seq_lengths, + x_desc, + x, + hx_desc, + hx, + y_desc, + y, + dweight_param_ptr, + weightspace_size, + workspace_ptr, + workspace_size, + reservespace_ptr, + reservespace_size)); +} + /* static */ void MLUCnnl::Mask(const ExecutionContext& ctx, cnnlMaskedOp_t masked_mode, const cnnlTensorDescriptor_t input_desc, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 85f4439c3b974..07c5031ee2eb1 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1924,6 +1924,30 @@ class MLUCnnl { void* cy, void* reservespace_ptr); + static void RNNBackward(const ExecutionContext& ctx, + const cnnlRNNDescriptor_t rnn_desc, + cnnlWgradMode_t add_grad, + const int dev_seq_lengths[], + const void* weight_param_ptr, + void* dweight_param_ptr, + size_t weightspace_size, + const cnnlSeqDataDescriptor_t x_desc, + const void* x, + void* dx, + const cnnlSeqDataDescriptor_t y_desc, + const void* y, + const void* dy, + const cnnlTensorDescriptor_t hx_desc, + const void* hx, + const void* dhy, + void* dhx, + const cnnlTensorDescriptor_t cx_desc, + const void* cx, + const void* dcy, + void* dcx, + void* reservespace_ptr, + size_t reservespace_size); + static void Mask(const ExecutionContext& 
ctx, cnnlMaskedOp_t masked_mode, const cnnlTensorDescriptor_t input_desc, diff --git a/paddle/fluid/operators/rnn_op_mlu.cc b/paddle/fluid/operators/rnn_op_mlu.cc index 653c50c83b83e..fe567333b6d40 100644 --- a/paddle/fluid/operators/rnn_op_mlu.cc +++ b/paddle/fluid/operators/rnn_op_mlu.cc @@ -28,7 +28,7 @@ void reset_parameter_vector( const std::vector& raw_params_vec, const int& num_layers, const bool& is_bidirec, - std::vector>>* params_vec) { + std::vector>>* params_vec) { // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers @@ -47,7 +47,8 @@ void reset_parameter_vector( } using remove_cv_t = typename std::remove_cv::type; params_vec->at(i)[j] = std::make_pair( - raw_params_vec[tensor_idx]->template data(), + const_cast( + raw_params_vec[tensor_idx]->template data()), raw_params_vec[tensor_idx]->numel() * sizeof(T)); } } @@ -66,7 +67,6 @@ class RNNMLUKernel : public framework::OpKernel { // Output auto state = ctx.MultiOutput("State"); auto* output = ctx.Output("Out"); - // auto* dropout_mask = ctx.Output("DropoutState"); auto* reserve_data = ctx.Output("Reserve"); // Attributes const int& num_layers = ctx.Attr("num_layers"); @@ -79,14 +79,6 @@ class RNNMLUKernel : public framework::OpKernel { sequence_length = ctx.Input("SequenceLength"); } - // if (dropout_mask->IsInitialized()) { - // if (dropout_mask->numel() != output->numel()) dropout_mask->clear(); - // } - // dropout_mask->mutable_data(output->dims(), ctx.GetPlace()); - // auto& dev_ctx = ctx.template device_context(); - // phi::funcs::SetConstant ones; - // ones(dev_ctx, dropout_mask, static_cast(1)); - auto init_h = pre_state[0]; // -> hx auto init_c = pre_state[1]; // -> cx auto last_h = state[0]; @@ -143,7 +135,7 @@ class RNNMLUKernel : public framework::OpKernel { init_c->dims()[0])); // weightlist - std::vector>> parameter_lists; + std::vector>> parameter_lists; parameter_lists.resize(num_layers); reset_parameter_vector( weight_list, num_layers, is_bidirec, ¶meter_lists); @@ -363,9 +355,390 @@ class RNNMLUKernel : public framework::OpKernel { } }; +template +class RNNMLUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto stream = ctx.template device_context().stream(); + // get the tensor pointer for the input + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto* output = ctx.Input("Out"); + auto* reserve_data = ctx.Input("Reserve"); + const int& num_layers = ctx.Attr("num_layers"); + const bool& is_bidirec = ctx.Attr("is_bidirec"); + const int& hidden_size = ctx.Attr("hidden_size"); + const std::string& mode = ctx.Attr("mode"); + + bool has_seq_length = ctx.HasInput("SequenceLength"); + const Tensor* sequence_length = nullptr; + if (has_seq_length) { + sequence_length = ctx.Input("SequenceLength"); + } + + PADDLE_ENFORCE_EQ( + mode, + "LSTM", + platform::errors::InvalidArgument( + "XPU only support LSTM mode now, current mode is %s", mode)); + + auto init_h = pre_state[0]; // -> hx + auto init_c = pre_state[1]; // -> cx + + auto output_grad = ctx.Input(framework::GradVarName("Out")); + auto state_grad = ctx.MultiInput(framework::GradVarName("State")); + auto last_h_grad = state_grad[0]; // -> dhy + auto last_c_grad = state_grad[1]; // -> dcy + + // 
get the tensor pointer for the output + auto* input_grad = ctx.Output(framework::GradVarName("Input")); + auto weight_grad_list = ctx.MultiOutput( + framework::GradVarName("WeightList")); + auto pre_state_grad = + ctx.MultiOutput(framework::GradVarName("PreState")); + Tensor* init_h_grad = nullptr; + Tensor* init_c_grad = nullptr; + if (pre_state_grad.size() > 0) { // has gradient + init_h_grad = pre_state_grad[0]; // -> dhx + init_c_grad = pre_state_grad[1]; // -> dcx + } + + // check shape + const int in_out_dim_num = input->dims().size(); + const int& seq_len = input->dims()[0]; + const int& batch_size = input->dims()[1]; + const int& input_dim = input->dims()[2]; + const int& direction_num = is_bidirec ? 2 : 1; + int in_dim_arr[in_out_dim_num] = {seq_len, batch_size, input_dim}; + int out_dim_arr[in_out_dim_num] = { + seq_len, batch_size, direction_num * hidden_size}; + int proj_size = hidden_size; + PADDLE_ENFORCE_EQ( + num_layers, + 1, + platform::errors::InvalidArgument( + "MLU only support 1 num_layers, current num_layers is %s", + num_layers)); + PADDLE_ENFORCE_EQ( + init_h->dims()[0], + num_layers * direction_num, + platform::errors::InvalidArgument("The num_layers of in RNN layer must" + " be the same as first dim of init" + "hidden, but received num_layers:%d," + " dim:%d", + num_layers, + init_h->dims()[0])); + PADDLE_ENFORCE_EQ( + init_c->dims()[0], + num_layers * direction_num, + platform::errors::InvalidArgument( + "The num_layers of in RNN layer must" + " be the same as first dim of cell state hidden, but received" + " num_layers:%d, dim:%d", + num_layers, + init_c->dims()[0])); + + std::vector>> parameter_lists; + parameter_lists.resize(num_layers); + reset_parameter_vector( + weight_list, num_layers, is_bidirec, ¶meter_lists); + + for (unsigned int i = 0; i < weight_grad_list.size(); ++i) { + weight_grad_list[i]->mutable_data(ctx.GetPlace()); + } + std::vector>> parameter_lists_grad; + parameter_lists_grad.resize(num_layers); + reset_parameter_vector( + weight_grad_list, num_layers, is_bidirec, ¶meter_lists_grad); + + // allocate the memory and initization the input_grad + input_grad->mutable_data(input->dims(), ctx.GetPlace()); + FillMLUTensorWithHostValue(ctx, static_cast(0.0), input_grad); + + Tensor a, b; + Tensor* dynamic_grad_pre_h = &a; + Tensor* dynamic_grad_pre_c = &b; + if (init_h_grad) { + init_h_grad->mutable_data(last_h_grad->dims(), ctx.GetPlace()); + FillMLUTensorWithHostValue(ctx, static_cast(0.0), init_h_grad); + } else { + dynamic_grad_pre_h->Resize(last_h_grad->dims()); + dynamic_grad_pre_h->mutable_data(ctx.GetPlace()); + FillMLUTensorWithHostValue(ctx, static_cast(0.0), dynamic_grad_pre_h); + init_h_grad = dynamic_grad_pre_h; + } + if (init_c_grad) { + init_c_grad->mutable_data(last_c_grad->dims(), ctx.GetPlace()); + } else { + dynamic_grad_pre_c->Resize(last_h_grad->dims()); + dynamic_grad_pre_c->mutable_data(ctx.GetPlace()); + init_c_grad = dynamic_grad_pre_c; + } + + std::vector seq_len_vec(batch_size, seq_len); + if (has_seq_length) { + seq_len_vec = operators::GetDataFromTensor(sequence_length); + } + cnnlDirectionMode_t direction = + is_bidirec ? 
CNNL_RNN_BIDIRECTIONAL : CNNL_RNN_UNIDIRECTIONAL; + + MLUSeqDataDesc input_seq_data_desc(CNNL_SEQDATA_TNC, + ToCnnlDataType(input->dtype()), + in_out_dim_num, + in_dim_arr, + static_cast(seq_len_vec.size()), + seq_len_vec.data(), + nullptr); + MLUSeqDataDesc out_seq_data_desc(CNNL_SEQDATA_TNC, + ToCnnlDataType(input->dtype()), + in_out_dim_num, + out_dim_arr, + static_cast(seq_len_vec.size()), + seq_len_vec.data(), + nullptr); + MLUCnnlTensorDesc hx_desc(*init_h); + MLUCnnlTensorDesc cx_desc(*init_c); + MLURNNDesc rnn_desc(CNNL_LSTM, + CNNL_RNN_DOUBLE_BIAS, + direction, + CNNL_RNN_LINEAR_INPUT, + ToCnnlDataType(input->dtype()), + ToCnnlDataType(input->dtype()), + input_dim, + hidden_size, + /*projection*/ proj_size, + num_layers, + nullptr, + CNNL_RNN_PADDED_IO_DISABLED); + rnn_desc.SetRNNMaskMode(CNNL_LSTM_MASK_ENABLED); + + // copy weight + size_t weightspace_size; + framework::Tensor weightspace, dweightspace; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetRNNWeightSpaceSize( + GetHandleFromCTX(ctx), rnn_desc.get(), &weightspace_size)); + + weightspace = ctx.AllocateTmpTensor( + {static_cast(weightspace_size)}, dev_ctx); + dweightspace = ctx.AllocateTmpTensor( + {static_cast(weightspace_size)}, dev_ctx); + void* weightspace_ptr = weightspace.mutable_data(ctx.GetPlace()); + auto w_x = parameter_lists[0][0]; + auto w_h = parameter_lists[0][1]; + auto b_x = parameter_lists[0][2]; + auto b_h = parameter_lists[0][3]; + auto actual_total_w_size = + w_x.second + w_h.second + b_x.second + b_h.second; + + void* w_x_ptr = weightspace_ptr; + void* w_h_ptr = static_cast(weightspace_ptr) + w_x.second; + void* b_x_ptr = + static_cast(weightspace_ptr) + w_x.second + w_h.second; + void* b_h_ptr = static_cast(weightspace_ptr) + w_x.second + + w_h.second + b_x.second; + + memory::Copy(weightspace.place(), + w_x_ptr, + weightspace.place(), + w_x.first, + w_x.second, + stream); + memory::Copy(weightspace.place(), + w_h_ptr, + weightspace.place(), + w_h.first, + w_h.second, + stream); + memory::Copy(weightspace.place(), + b_x_ptr, + weightspace.place(), + b_x.first, + b_x.second, + stream); + memory::Copy(weightspace.place(), + b_h_ptr, + weightspace.place(), + b_h.first, + b_h.second, + stream); + + if (is_bidirec) { + auto bw_x = parameter_lists[0][4]; + auto bw_h = parameter_lists[0][5]; + auto bb_x = parameter_lists[0][6]; + auto bb_h = parameter_lists[0][7]; + void* bw_x_ptr = + static_cast(weightspace_ptr) + actual_total_w_size; + void* bw_h_ptr = static_cast(weightspace_ptr) + + actual_total_w_size + bw_x.second; + void* bb_x_ptr = static_cast(weightspace_ptr) + + actual_total_w_size + bw_x.second + bw_h.second; + void* bb_h_ptr = static_cast(weightspace_ptr) + + actual_total_w_size + bw_x.second + bw_h.second + + bb_x.second; + actual_total_w_size += + bw_x.second + bw_h.second + bb_x.second + bb_h.second; + + memory::Copy(weightspace.place(), + bw_x_ptr, + weightspace.place(), + bw_x.first, + bw_x.second, + stream); + memory::Copy(weightspace.place(), + bw_h_ptr, + weightspace.place(), + bw_h.first, + bw_h.second, + stream); + memory::Copy(weightspace.place(), + bb_x_ptr, + weightspace.place(), + bb_x.first, + bb_x.second, + stream); + memory::Copy(weightspace.place(), + bb_h_ptr, + weightspace.place(), + bb_h.first, + bb_h.second, + stream); + } + dev_ctx.Wait(); + + PADDLE_ENFORCE_EQ(weightspace_size, + actual_total_w_size, + platform::errors::InvalidArgument( + "The weightsize doesn't match" + " weightspace_size:%d, actual_total_w_size:%d", + weightspace_size, + actual_total_w_size)); + + 
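From Python this forward/backward pair is typically exercised through paddle.nn.LSTM on an MLU place; a rough sketch, assuming a Paddle build with MLU support:

    import paddle

    paddle.set_device('mlu:0')  # requires an MLU-enabled build
    lstm = paddle.nn.LSTM(input_size=16, hidden_size=32, num_layers=1)
    x = paddle.randn([4, 10, 16])   # [batch, seq_len, input_size]
    y, (h, c) = lstm(x)             # forward  -> rnn op
    y.mean().backward()             # backward -> rnn_grad op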
MLUCnnl::RNNBackward(ctx, + rnn_desc.get(), + CNNL_WGRAD_MODE_SET, + seq_len_vec.data(), + GetBasePtr(&weightspace), + GetBasePtr(&dweightspace), + weightspace.numel() * sizeof(T), + input_seq_data_desc.get(), + GetBasePtr(input), + GetBasePtr(input_grad), + out_seq_data_desc.get(), + GetBasePtr(output), + GetBasePtr(output_grad), + hx_desc.get(), + GetBasePtr(init_h), + GetBasePtr(last_h_grad), + GetBasePtr(init_h_grad), + cx_desc.get(), + GetBasePtr(init_c), + GetBasePtr(last_c_grad), + GetBasePtr(init_c_grad), + const_cast(GetBasePtr(reserve_data)), + reserve_data->numel() * sizeof(T)); + + void* dweightspace_ptr = dweightspace.mutable_data(ctx.GetPlace()); + auto dw_x = parameter_lists_grad[0][0]; + auto dw_h = parameter_lists_grad[0][1]; + auto db_x = parameter_lists_grad[0][2]; + auto db_h = parameter_lists_grad[0][3]; + auto dactual_total_w_size = + dw_x.second + dw_h.second + db_x.second + db_h.second; + + void* dw_x_ptr = dweightspace_ptr; + void* dw_h_ptr = static_cast(dweightspace_ptr) + dw_x.second; + void* db_x_ptr = + static_cast(dweightspace_ptr) + dw_x.second + dw_h.second; + void* db_h_ptr = static_cast(dweightspace_ptr) + dw_x.second + + dw_h.second + db_x.second; + + memory::Copy(weightspace.place(), + dw_x.first, + weightspace.place(), + dw_x_ptr, + dw_x.second, + stream); + memory::Copy(weightspace.place(), + dw_h.first, + weightspace.place(), + dw_h_ptr, + dw_h.second, + stream); + memory::Copy(weightspace.place(), + db_x.first, + weightspace.place(), + db_x_ptr, + db_x.second, + stream); + memory::Copy(weightspace.place(), + db_h.first, + weightspace.place(), + db_h_ptr, + db_h.second, + stream); + + if (is_bidirec) { + auto dbw_x = parameter_lists_grad[0][4]; + auto dbw_h = parameter_lists_grad[0][5]; + auto dbb_x = parameter_lists_grad[0][6]; + auto dbb_h = parameter_lists_grad[0][7]; + void* dbw_x_ptr = + static_cast(dweightspace_ptr) + dactual_total_w_size; + void* dbw_h_ptr = static_cast(dweightspace_ptr) + + dactual_total_w_size + dbw_x.second; + void* dbb_x_ptr = static_cast(dweightspace_ptr) + + dactual_total_w_size + dbw_x.second + dbw_h.second; + void* dbb_h_ptr = static_cast(dweightspace_ptr) + + dactual_total_w_size + dbw_x.second + dbw_h.second + + dbb_x.second; + dactual_total_w_size += + dbw_x.second + dbw_h.second + dbb_x.second + dbb_h.second; + + memory::Copy(weightspace.place(), + dbw_x.first, + weightspace.place(), + dbw_x_ptr, + dbw_x.second, + stream); + memory::Copy(weightspace.place(), + dbw_h.first, + weightspace.place(), + dbw_h_ptr, + dbw_h.second, + stream); + memory::Copy(weightspace.place(), + dbb_x.first, + weightspace.place(), + dbb_x_ptr, + dbb_x.second, + stream); + memory::Copy(weightspace.place(), + dbb_h.first, + weightspace.place(), + dbb_h_ptr, + dbb_h.second, + stream); + } + dev_ctx.Wait(); + + PADDLE_ENFORCE_EQ(weightspace_size, + dactual_total_w_size, + platform::errors::InvalidArgument( + "The weightsize doesn't match" + " weightspace_size:%d, dactual_total_w_size:%d", + weightspace_size, + dactual_total_w_size)); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_MLU_KERNEL( rnn, ops::RNNMLUKernel); +REGISTER_OP_MLU_KERNEL( + rnn_grad, ops::RNNMLUGradKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_rnn_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_rnn_op_mlu.py index f1aabbd3b603b..917597daf3a1d 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_rnn_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_rnn_op_mlu.py @@ -135,43 
+135,50 @@ def init_size(self): def test_output(self): self.check_output_with_place( - self.place, no_check_set=['Reserve', 'DropoutState', 'State']) + self.place, + atol=1e-4, + no_check_set=['Reserve', 'DropoutState', 'State']) def set_attrs(self): pass - # def test_grad(self): - # if not self.is_test: - # var_name_list = self.get_weight_names() - # grad_check_list = ['Input', 'init_h', 'init_c'] - # grad_check_list.extend(var_name_list) - # self.check_grad_with_place(self.place, set(grad_check_list), - # ['Out', 'last_hidden', 'last_cell']) + def test_grad(self): + if not self.is_test and self.sequence_length is None: + # if not self.is_test: + var_name_list = self.get_weight_names() + grad_check_list = ['Input', 'init_h', 'init_c'] + grad_check_list.extend(var_name_list) + self.check_grad_with_place(self.place, set(grad_check_list), + ['Out', 'last_hidden', 'last_cell']) -# class TestRNNOp1(TestRNNOp): +class TestRNNOp1(TestRNNOp): -# def set_attrs(self): -# self.sequence_length = None + def set_attrs(self): + self.sequence_length = None -# class TestRNNOp2(TestRNNOp): -# def set_attrs(self): -# self.sequence_length = None -# self.is_bidirec = True +class TestRNNOp2(TestRNNOp): -# class TestRNNOp3(TestRNNOp): + def set_attrs(self): + self.sequence_length = None + self.is_bidirec = True -# def set_attrs(self): -# self.is_test = True -# self.sequence_length = None -# class TestRNNOp4(TestRNNOp): +class TestRNNOp3(TestRNNOp): + + def set_attrs(self): + self.is_test = True + self.sequence_length = None + + +class TestRNNOp4(TestRNNOp): + + def set_attrs(self): + self.is_test = True + self.sequence_length = None + self.is_bidirec = True -# def set_attrs(self): -# self.is_test = True -# self.sequence_length = None -# self.is_bidirec = True #TODO(chenxiao): cnnl doesn't support num_layers > 1 case # class TestRNNOp5(TestRNNOp): From 88e27a079953c616659de3804550b6e5425cdf66 Mon Sep 17 00:00:00 2001 From: Lux et Veritas <1004239791@qq.com> Date: Fri, 1 Jul 2022 10:50:20 +0800 Subject: [PATCH 019/250] [MLU] add mlu kernel for fill_constant_batch_size_like (#43820) --- .../fill_constant_batch_size_like_op_mlu.cc | 99 ++++++++ ...st_fill_constant_batch_size_like_op_mlu.py | 219 ++++++++++++++++++ 2 files changed, 318 insertions(+) create mode 100644 paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_fill_constant_batch_size_like_op_mlu.py diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc new file mode 100644 index 0000000000000..425222bcd660c --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { +template +class FillConstantBatchSizeLikeOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto float_value = ctx.Attr("value"); + auto str_value = ctx.Attr("str_value"); + auto force_cpu = ctx.Attr("force_cpu"); + + auto *out = ctx.Output("Out"); + auto *in = ctx.Input("Input"); + if (in->lod().size() && ctx.Attr("input_dim_idx") == 0) { + // set the correct batch size for the LoDTensor. + auto odims = out->dims(); + int output_dim_idx = ctx.Attr("output_dim_idx"); + odims[output_dim_idx] = static_cast(in->lod().back().size()) - 1; + out->mutable_data(odims, ctx.GetPlace()); + } + + T value; + if (str_value.empty()) { + value = static_cast(float_value); + } else { + // handle NaN/Inf first, which cannot be read from stream. + if (str_value == "inf") { + value = static_cast(std::numeric_limits::infinity()); + } else if (str_value == "-inf") { + value = static_cast(-std::numeric_limits::infinity()); + } else if (str_value == "nan") { + value = static_cast(std::numeric_limits::quiet_NaN()); + } else { + std::stringstream convert_stream(str_value); + if (std::is_same::value) { + int64_t tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } else { + double tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } + } + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); + if (cpu_place) { + auto &dev_ctx = *pool.Get(platform::CPUPlace()); + phi::funcs::SetConstant functor; + out->mutable_data(platform::CPUPlace(), + framework::TransToPhiDataType(data_type)); + functor(reinterpret_cast(dev_ctx), + out, + static_cast(value)); + } else { + out->mutable_data(ctx.GetPlace(), + framework::TransToPhiDataType(data_type)); + const T *value_data = &value; + cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST; + MLUCnnlTensorDesc output_desc(*out); + MLUCnnl::Fill( + ctx, pointer_mode, value_data, output_desc.get(), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpMLUKernel, + ops::FillConstantBatchSizeLikeOpMLUKernel, + ops::FillConstantBatchSizeLikeOpMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_batch_size_like_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_batch_size_like_op_mlu.py new file mode 100644 index 0000000000000..1e8275df0b5a7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_fill_constant_batch_size_like_op_mlu.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys + +sys.path.append("..") +import paddle +import paddle.fluid.core as core +from paddle.static import program_guard, Program +import paddle.compat as cpt +import unittest +import numpy as np +from op_test import OpTest +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +paddle.enable_static() + + +def fill_constant_batch_size_like(input, + shape, + value, + data_type, + input_dim_idx=0, + output_dim_idx=0, + force_cpu=False): + return paddle.fluid.layers.fill_constant_batch_size_like( + input, shape, data_type, value, input_dim_idx, output_dim_idx, + force_cpu) + + +class TestFillConstantBatchSizeLike(OpTest): + + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = "fill_constant_batch_size_like" + self.init_shape() + self.init_value() + self.init_dtype() + self.init_force_cpu() + self.init_dim_idx() + + self.inputs = { + 'Input': np.random.random(self.input_shape).astype("float32") + } + self.attrs = { + 'shape': self.shape, + 'value': self.value, + 'str_value': self.str_value, + 'dtype': self.dtype, + 'force_cpu': self.force_cpu, + 'input_dim_idx': self.input_dim_idx, + 'output_dim_idx': self.output_dim_idx + } + self.outputs = { + 'Out': np.full(self.output_shape, self.output_value, + self.output_dtype) + } + + def init_shape(self): + self.input_shape = [4, 5] + self.shape = [123, 92] + self.output_shape = (4, 92) + + def init_value(self): + self.value = 3.8 + self.str_value = '' + self.output_value = 3.8 + + def init_dtype(self): + self.dtype = core.VarDesc.VarType.FP32 + self.output_dtype = np.float32 + + def init_force_cpu(self): + self.force_cpu = False + + def init_dim_idx(self): + self.input_dim_idx = 0 + self.output_dim_idx = 0 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantBatchSizeLike2(TestFillConstantBatchSizeLike): + + def init_shape(self): + # test shape + self.input_shape = [4, 5, 6, 7] + self.shape = [10, 123, 92] + self.output_shape = (4, 123, 92) + + +class TestFillConstantBatchSizeLike3(TestFillConstantBatchSizeLike): + + def init_value(self): + # use 'str_value' rather than 'value' + self.value = 3.8 + self.str_value = '4.5' + self.output_value = 4.5 + + +class TestFillConstantBatchSizeLike4(TestFillConstantBatchSizeLike): + + def init_value(self): + # str_value = 'inf' + self.value = 3.8 + self.str_value = 'inf' + self.output_value = float('inf') + + +class TestFillConstantBatchSizeLike5(TestFillConstantBatchSizeLike): + + def init_value(self): + # str_value = '-inf' + self.value = 3.8 + self.str_value = '-inf' + self.output_value = -float('inf') + + +class TestFillConstantBatchSizeLike6(TestFillConstantBatchSizeLike): + + def init_dtype(self): + self.dtype = core.VarDesc.VarType.FP16 + self.output_dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-2) + + +class TestFillConstantBatchSizeLike7(TestFillConstantBatchSizeLike): + + def init_dtype(self): + self.dtype = core.VarDesc.VarType.INT32 + self.output_dtype = np.int32 + + +class 
TestFillConstantBatchSizeLike8(TestFillConstantBatchSizeLike): + + def init_force_cpu(self): + self.force_cpu = True + + +class TestFillConstantBatchSizeLike9(TestFillConstantBatchSizeLike): + + def init_shape(self): + self.input_shape = [4, 5] + self.shape = [123, 92] + self.output_shape = (123, 4) + + def init_dim_idx(self): + self.input_dim_idx = 0 + self.output_dim_idx = 1 + + +class TestFillConstantBatchSizeLikeLodTensor(TestFillConstantBatchSizeLike): + # test LodTensor + def setUp(self): + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.op_type = "fill_constant_batch_size_like" + self.init_shape() + self.init_value() + self.init_dtype() + self.init_force_cpu() + self.init_dim_idx() + + lod = [[3, 2, 5]] + self.inputs = { + 'Input': (np.random.random(self.input_shape).astype("float32"), lod) + } + self.attrs = { + 'shape': self.shape, + 'value': self.value, + 'str_value': self.str_value, + 'dtype': self.dtype, + 'force_cpu': self.force_cpu, + 'input_dim_idx': self.input_dim_idx, + 'output_dim_idx': self.output_dim_idx + } + self.outputs = { + 'Out': np.full(self.output_shape, self.output_value, + self.output_dtype) + } + + def init_shape(self): + self.input_shape = [10, 20] + self.shape = [123, 92] + self.output_shape = (3, 92) + + +class TestFillConstantBatchSizeLikeLodTensor2( + TestFillConstantBatchSizeLikeLodTensor): + # test LodTensor with 'input_dim_idx' != 0 + def init_shape(self): + self.input_shape = [10, 20] + self.shape = [123, 92] + self.output_shape = (20, 92) + + def init_dim_idx(self): + self.input_dim_idx = 1 + self.output_dim_idx = 0 + + +if __name__ == "__main__": + unittest.main() From ccb333c108ec82423b61f6827da34a734b382f00 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 1 Jul 2022 11:00:47 +0800 Subject: [PATCH 020/250] Fix core so name mismatch error (#43977) * fix core avx soname error * remove print info --- python/setup.py.in | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/setup.py.in b/python/setup.py.in index 567a411d0980b..624218c5caf67 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -578,7 +578,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': commands = ["install_name_tool -id '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') else: - commands = ["patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] + commands = ["patchelf --set-soname '${FLUID_CORE_NAME}.so' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] + commands.append("patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') # The sw_64 not suppot patchelf, so we just disable that. 
if platform.machine() != 'sw_64' and platform.machine() != 'mips64': for command in commands: From 8571833fc7e00b37ccd417c57a94415210a82d6b Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 1 Jul 2022 11:01:54 +0800 Subject: [PATCH 021/250] [Dy2Stat]Enhance nonlocal machanism while returning single var (#43957) * [Dy2Stat]Enhance nonlocal machanism while returning single var * [Dy2Stat]Enhance nonlocal machanism while returning single var --- .../dygraph_to_static/convert_operators.py | 15 +++++---------- .../fluid/dygraph/dygraph_to_static/utils.py | 12 +++++------- .../dygraph_to_static/test_program_translator.py | 16 ++++++++-------- 3 files changed, 18 insertions(+), 25 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index a6cab0db51380..c0c679e2e1ef0 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -50,18 +50,13 @@ def convert_while_loop(cond, body, getter, setter): def _run_paddle_while(cond, body, getter, setter): # NOTE: loop_vars of Paddle op `control_flow.while_loop` must be Paddle Tensors. - def to_list(x): - if isinstance(x, (tuple, list)): return x - return [x] # UndefinedVar will become data layer not check. - loop_vars = [to_static_variable(var) for var in to_list(getter())] - setter(loop_vars if len(loop_vars) > 1 else - loop_vars[0]) # change the non-local var to variable + loop_vars = [to_static_variable(var) for var in getter()] + setter(loop_vars) # change the non-local var to variable # variable maybe modified to inner var. change it into loop_vars = control_flow.while_loop(cond, body, loop_vars) - setter(loop_vars if len(loop_vars) > 1 else - loop_vars[0]) # change the non-local var to variable + setter(loop_vars) # change the non-local var to variable return loop_vars @@ -318,11 +313,11 @@ def _recover_args_state(outs, get_args, set_args, return_name_ids): init_args = get_args() # recover args state num_outs = len(return_name_ids) - num_args = 1 if not isinstance(init_args, tuple) else len(init_args) + num_args = len(init_args) assert num_outs <= num_args if num_args == 1: - final_outs = outs + final_outs = (outs, ) else: outs = (outs, ) if num_outs == 1 else outs final_outs = outs + init_args[num_outs:] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 466e9ee4d34c1..b51635b85f945 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -38,6 +38,9 @@ PADDLE_MODULE_PREFIX = 'paddle.' 
DYGRAPH_MODULE_PREFIX = 'paddle.fluid.dygraph' DYGRAPH_TO_STATIC_MODULE_PREFIX = 'paddle.fluid.dygraph.dygraph_to_static' +GET_ARGS_FUNC_PREFIX = 'get_args' +SET_ARGS_FUNC_PREFIX = 'set_args' +ARGS_NAME = '__args' class BaseNodeVisitor(gast.NodeVisitor): @@ -1619,7 +1622,7 @@ def {func_name}(): template = """ def {func_name}(): nonlocal {nonlocal_vars} - return {vars} + return {vars}, """ func_def = template.format( func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX), @@ -1628,11 +1631,6 @@ def {func_name}(): return gast.parse(textwrap.dedent(func_def)).body[0] -GET_ARGS_FUNC_PREFIX = 'get_args' -SET_ARGS_FUNC_PREFIX = 'set_args' -ARGS_NAME = '__args' - - def create_set_args_node(names): """ Create set_args function as follows: @@ -1661,7 +1659,7 @@ def {func_name}({args}): template = """ def {func_name}({args}): nonlocal {nonlocal_vars} - {vars} = {args} + {vars}, = {args} """ func_def = template.format( func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index 41968278f7bc0..8d2665129e94e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -74,11 +74,11 @@ def dyfunc_with_if_else(x_v, label=None): def get_args_0(): nonlocal x_v - return x_v + return x_v, def set_args_0(__args): nonlocal x_v - x_v = __args + x_v, = __args def true_fn_0(): nonlocal x_v @@ -96,11 +96,11 @@ def false_fn_0(): def get_args_1(): nonlocal __return_value_0, label, x_v - return __return_value_0, label, x_v + return __return_value_0, label, x_v, def set_args_1(__args): nonlocal __return_value_0, label, x_v - __return_value_0, label, x_v = __args + __return_value_0, label, x_v, = __args def true_fn_1(): nonlocal __return_value_0, label, x_v @@ -131,11 +131,11 @@ def dyfunc_with_if_else(x_v, label=None): def get_args_2(): nonlocal x_v - return x_v + return x_v, def set_args_2(__args): nonlocal x_v - x_v = __args + x_v, = __args def true_fn_2(): nonlocal x_v @@ -153,11 +153,11 @@ def false_fn_2(): def get_args_3(): nonlocal __return_value_1, label, x_v - return __return_value_1, label, x_v + return __return_value_1, label, x_v, def set_args_3(__args): nonlocal __return_value_1, label, x_v - __return_value_1, label, x_v = __args + __return_value_1, label, x_v, = __args def true_fn_3(): nonlocal __return_value_1, label, x_v From 267d3191a9afcafcaf99ad51d79def7a26257bd6 Mon Sep 17 00:00:00 2001 From: enzodechine Date: Fri, 1 Jul 2022 12:16:08 +0800 Subject: [PATCH 022/250] Re-write the unit tests for compare xpu op (#43460) * re-write the unit tests for compare xpu op *test=kunlun * re-write the unit tests for compare xpu op *test=kunlun Co-authored-by: runzhech --- .../unittests/xpu/test_compare_op_xpu.py | 651 ++++++++++-------- 1 file changed, 381 insertions(+), 270 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py index f33da83bae7a1..cdaf767a1de68 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py @@ -12,282 +12,393 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function - import sys sys.path.append("..") import unittest import numpy as np -import paddle.fluid.core as core -import paddle.fluid as fluid -from op_test_xpu import OpTest, XPUOpTest +from op_test_xpu import XPUOpTest import paddle -from paddle.fluid import Program, program_guard - - -def create_test_class(op_type, typename, callback): - - class Cls(OpTest): - - def setUp(self): - a = np.random.random(size=(10, 7)).astype(typename) - b = np.random.random(size=(10, 7)).astype(typename) - c = callback(a, b) - self.inputs = {'X': a, 'Y': b} - self.outputs = {'Out': c} - self.op_type = op_type - self.use_xpu = True - self.attrs = {'use_xpu': True} - - def test_check_output(self): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_errors(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = fluid.layers.data(name='x', shape=[2], dtype='int32') - y = fluid.layers.data(name='y', shape=[2], dtype='int32') - a = fluid.layers.data(name='a', shape=[2], dtype='int16') - if self.op_type == "less_than": - self.assertRaises(TypeError, - fluid.layers.less_than, - x=x, - y=y, - force_cpu=1) - op = eval("fluid.layers.%s" % self.op_type) - self.assertRaises(TypeError, op, x=x, y=y, cond=1) - self.assertRaises(TypeError, op, x=x, y=a) - self.assertRaises(TypeError, op, x=a, y=y) - - cls_name = "{0}_{1}".format(op_type, typename) - Cls.__name__ = cls_name - globals()[cls_name] = Cls - - -for _type_name in {'int32'}: - if _type_name == 'float64' and core.is_compiled_with_rocm(): - _type_name = 'float32' - - create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) - create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) - create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) - create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b) - create_test_class('equal', _type_name, lambda _a, _b: _a == _b) - create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) - - -def create_paddle_case(op_type, callback): - - class PaddleCls(unittest.TestCase): - - def setUp(self): - self.op_type = op_type - self.input_x = np.array([1, 2, 3, 4]).astype(np.int64) - self.input_y = np.array([1, 3, 2, 4]).astype(np.int64) - self.real_result = callback(self.input_x, self.input_y) - self.place = fluid.XPUPlace( - 0) if fluid.core.is_compiled_with_xpu() else fluid.CPUPlace() - - def test_api(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = fluid.data(name='x', shape=[4], dtype='int64') - y = fluid.data(name='y', shape=[4], dtype='int64') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = fluid.Executor(self.place) - res, = exe.run(feed={ - "x": self.input_x, - "y": self.input_y - }, - fetch_list=[out]) - self.assertEqual((res == self.real_result).all(), True) - - def test_api_float(self): - if self.op_type == "equal": - paddle.enable_static() - with program_guard(Program(), Program()): - x = fluid.data(name='x', shape=[4], dtype='int64') - y = fluid.data(name='y', shape=[1], dtype='int64') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = fluid.Executor(self.place) - res, = exe.run(feed={ - "x": self.input_x, - "y": 1.0 - }, - fetch_list=[out]) - self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) - self.assertEqual((res == self.real_result).all(), True) - - def test_dynamic_api(self): - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - y = paddle.to_tensor(self.input_y) - op 
= eval("paddle.%s" % (self.op_type)) - out = op(x, y) - self.assertEqual((out.numpy() == self.real_result).all(), True) - paddle.enable_static() - - def test_dynamic_api_int(self): - if self.op_type == "equal": - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - op = eval("paddle.%s" % (self.op_type)) - out = op(x, 1) - self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) - self.assertEqual((out.numpy() == self.real_result).all(), True) - paddle.enable_static() - - def test_dynamic_api_float(self): - if self.op_type == "equal": - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - op = eval("paddle.%s" % (self.op_type)) - out = op(x, 1.0) - self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) - self.assertEqual((out.numpy() == self.real_result).all(), True) - paddle.enable_static() - - def test_assert(self): - - def test_dynamic_api_string(self): - if self.op_type == "equal": - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - op = eval("paddle.%s" % (self.op_type)) - out = op(x, "1.0") - paddle.enable_static() - - self.assertRaises(TypeError, test_dynamic_api_string) - - def test_dynamic_api_bool(self): - if self.op_type == "equal": - paddle.disable_static() - x = paddle.to_tensor(self.input_x) - op = eval("paddle.%s" % (self.op_type)) - out = op(x, True) - self.real_result = np.array([1, 0, 0, 0]).astype(np.int64) - self.assertEqual((out.numpy() == self.real_result).all(), True) - paddle.enable_static() - - def test_broadcast_api_1(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', - shape=[1, 2, 1, 3], - dtype='int32') - y = paddle.static.data(name='y', shape=[1, 2, 3], dtype='int32') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32) - input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_broadcast_api_2(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', shape=[1, 2, 3], dtype='int32') - y = paddle.static.data(name='y', - shape=[1, 2, 1, 3], - dtype='int32') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32) - input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_broadcast_api_3(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', shape=[5], dtype='int32') - y = paddle.static.data(name='y', shape=[3, 1], dtype='int32') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.arange(0, 5).reshape((5)).astype(np.int32) - input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(np.int32) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_bool_api_4(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', 
shape=[3, 1], dtype='bool') - y = paddle.static.data(name='y', shape=[3, 1], dtype='bool') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool_) - input_y = np.array([True, True, False]).astype(np.bool_) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_bool_broadcast_api_4(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = paddle.static.data(name='x', shape=[3, 1], dtype='bool') - y = paddle.static.data(name='y', shape=[1], dtype='bool') - op = eval("paddle.%s" % (self.op_type)) - out = op(x, y) - exe = paddle.static.Executor(self.place) - input_x = np.array([True, False, True]).astype(np.bool_) - input_y = np.array([True]).astype(np.bool_) - real_result = callback(input_x, input_y) - res, = exe.run(feed={ - "x": input_x, - "y": input_y - }, - fetch_list=[out]) - self.assertEqual((res == real_result).all(), True) - - def test_attr_name(self): - paddle.enable_static() - with program_guard(Program(), Program()): - x = fluid.layers.data(name='x', shape=[4], dtype='int32') - y = fluid.layers.data(name='y', shape=[4], dtype='int32') - op = eval("paddle.%s" % (self.op_type)) - out = op(x=x, y=y, name="name_%s" % (self.op_type)) - self.assertEqual("name_%s" % (self.op_type) in out.name, True) - - cls_name = "TestCase_{}".format(op_type) - PaddleCls.__name__ = cls_name - globals()[cls_name] = PaddleCls - - -create_paddle_case('less_than', lambda _a, _b: _a < _b) -create_paddle_case('less_equal', lambda _a, _b: _a <= _b) -create_paddle_case('greater_than', lambda _a, _b: _a > _b) -create_paddle_case('greater_equal', lambda _a, _b: _a >= _b) -create_paddle_case('equal', lambda _a, _b: _a == _b) -create_paddle_case('not_equal', lambda _a, _b: _a != _b) +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types +from xpu.get_test_cover_info import XPUOpTestWrapper + + +class TestCompareOpBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.config() + self.set_case() + self.inputs = {'X': self.x, 'Y': self.y} + self.outputs = {'Out': self.result} + + def set_case(self): + self.x = np.random.uniform(self.lbound, self.hbound, + self.x_shape).astype(self.dtype) + self.y = np.random.uniform(self.lbound, self.hbound, + self.y_shape).astype(self.dtype) + self.result = self.compute(self.x, self.y) + + def config(self): + self.dtype = np.float32 + self.op_type = 'less_than' + self.compute = np.less + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + def test_check_output(self): + paddle.enable_static() + self.check_output_with_place(self.place) + + +class XPUTestLessThanOP(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'less_than' + self.use_dynamic_create_class = False + + class LessThanOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'less_than' + self.compute = np.less + self.set_data() + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + class LessThanOpTestCase2(LessThanOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17] + self.y_shape = [1] + + class LessThanOpTestCase3(LessThanOpTestCase1): + + def set_data(self): + self.lbound = -300 + self.hbound = 300 + 
self.x_shape = [11, 17, 29] + self.y_shape = [1] + + class LessThanOpTestCase4(LessThanOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [128, 128, 512] + self.y_shape = [1] + + class LessThanOpTestCase5(LessThanOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [128, 128, 512] + self.y_shape = [128, 128, 512] + + +support_types = get_xpu_op_support_types('less_than') +for stype in support_types: + create_test_class(globals(), XPUTestLessThanOP, stype) + + +class XPUTestLessEqualOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'less_equal' + self.use_dynamic_create_class = False + + class LessEqualOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'less_equal' + self.compute = np.less_equal + self.set_data() + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + class LessEqualOpTestCase2(LessEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17, 255] + self.y_shape = [11, 17, 255] + + class LessEqualOpTestCase3(LessEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17, 255] + self.y_shape = [1] + + class LessEqualOpTestCase4(LessEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17] + self.y_shape = [1] + + class LessEqualOpTestCase5(LessEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [128, 128, 512] + self.y_shape = [128, 128, 512] + + +support_types = get_xpu_op_support_types('less_equal') +for stype in support_types: + create_test_class(globals(), XPUTestLessEqualOp, stype) + + +class XPUTestGreaterThanOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'greater_than' + self.use_dynamic_create_class = False + + class GreaterThanOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'greater_than' + self.compute = np.greater + self.set_data() + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [128, 128, 512] + self.y_shape = [128, 128, 512] + + class GreaterThanOpTestCase2(GreaterThanOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [128, 128, 512] + self.y_shape = [1] + + class GreaterThanOpTestCase3(GreaterThanOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [1] + + class GreaterThanOpTestCase4(GreaterThanOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + class GreaterThanOpTestCase5(GreaterThanOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 10, 20, 20] + self.y_shape = [10, 10, 20, 20] + + +support_types = get_xpu_op_support_types('greater_than') +for stype in support_types: + create_test_class(globals(), XPUTestGreaterThanOp, stype) + + +class XPUTestGreaterEqualOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'greater_equal' + self.use_dynamic_create_class = False + + class GreaterEqualOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'greater_equal' + self.compute = np.greater_equal + self.set_data() + + def set_data(self): + self.lbound = -100 + 
self.hbound = 100 + self.x_shape = [10, 10, 20, 20] + self.y_shape = [10, 10, 20, 20] + + class GreaterEqualOpTestCase2(GreaterEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 10] + self.y_shape = [10, 10] + + class GreaterEqualOpTestCase3(GreaterEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [512, 512, 2] + self.y_shape = [1] + + class GreaterEqualOpTestCase4(GreaterEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 10, 20, 20] + self.y_shape = [1] + + class GreaterEqualOpTestCase5(GreaterEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 30, 15] + self.y_shape = [10, 30, 15] + + +support_types = get_xpu_op_support_types('greater_equal') +for stype in support_types: + create_test_class(globals(), XPUTestGreaterEqualOp, stype) + + +class XPUTestEqualOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'equal' + self.use_dynamic_create_class = False + + class EqualOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'equal' + self.compute = np.equal + self.set_data() + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 30, 15] + self.y_shape = [10, 30, 15] + + class EqualOpTestCase2(EqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [10, 30, 15] + self.y_shape = [1] + + class EqualOpTestCase3(EqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [10, 30] + self.y_shape = [10, 30] + + class EqualOpTestCase4(EqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [256, 256, 10] + self.y_shape = [256, 256, 10] + + class EqualOpTestCase5(EqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [1] + + +support_types = get_xpu_op_support_types('equal') +for stype in support_types: + create_test_class(globals(), XPUTestEqualOp, stype) + + +class XPUTestNotEqualOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'not_equal' + self.use_dynamic_create_class = False + + class NotEqualOpTestCase1(TestCompareOpBase): + + def config(self): + self.dtype = self.in_type + self.op_type = 'not_equal' + self.compute = np.not_equal + self.set_data() + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [11, 17] + self.y_shape = [1] + + class NotEqualOpTestCase2(NotEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17] + self.y_shape = [11, 17] + + class NotEqualOpTestCase3(NotEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [11, 17, 30] + self.y_shape = [1] + + class NotEqualOpTestCase4(NotEqualOpTestCase1): + + def set_data(self): + self.lbound = -200 + self.hbound = 200 + self.x_shape = [256, 256, 10] + self.y_shape = [256, 256, 10] + + class NotEqualOpTestCase5(NotEqualOpTestCase1): + + def set_data(self): + self.lbound = -100 + self.hbound = 100 + self.x_shape = [512, 128] + self.y_shape = [512, 128] + + +support_types = get_xpu_op_support_types('not_equal') +for stype in support_types: + create_test_class(globals(), XPUTestNotEqualOp, stype) if __name__ == '__main__': unittest.main() From 76156d12625037ee836aaeeb389f3edb4a5b6a5b Mon Sep 17 00:00:00 
2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Fri, 1 Jul 2022 13:18:35 +0800 Subject: [PATCH 023/250] [inference TRT]template GetWeightCPUData (#43993) * template GetWeightCPUData --- paddle/fluid/inference/tensorrt/engine.cc | 39 ++++++++++++----------- paddle/fluid/inference/tensorrt/engine.h | 16 +++++++--- 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 82c51311a03d5..9fe8f67e6a657 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -390,33 +390,36 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { return itensor_map_[name]; } +std::unordered_map + *TensorRTEngine::GetITensorMap() { + return &itensor_map_; +} + void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { runtime_batch_ = batch_size; } -float *TensorRTEngine::GetWeightCPUData(const std::string &name, - framework::Tensor *weight_tensor) { - static int name_suffix_counter = 0; - std::string name_suffix = std::to_string(name_suffix_counter); - std::string splitter = "__"; - std::string name_with_suffix = name + splitter + name_suffix; +template +T *TensorRTEngine::GetWeightCPUData(const std::string &name, + framework::Tensor *weight_tensor) { + std::unique_ptr cpu_weight_tensor(new framework::Tensor()); platform::CPUPlace cpu_place; - PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), - 0, - platform::errors::AlreadyExists( - "The weight named %s is set into the weight map " - "twice in TRT OP converter.", - name_with_suffix)); - weight_map[name_with_suffix].reset(new framework::Tensor()); - weight_map[name_with_suffix]->Resize(weight_tensor->dims()); + cpu_weight_tensor->Resize(weight_tensor->dims()); paddle::framework::TensorCopySync( - *weight_tensor, cpu_place, weight_map[name_with_suffix].get()); - float *weight_data = - weight_map[name_with_suffix]->mutable_data(cpu_place); - name_suffix_counter += 1; + *weight_tensor, cpu_place, cpu_weight_tensor.get()); + T *weight_data = cpu_weight_tensor->mutable_data(cpu_place); + SetWeights(name, std::move(cpu_weight_tensor)); return weight_data; } +template float *TensorRTEngine::GetWeightCPUData( + const std::string &name, framework::Tensor *weight_tensor); +template int32_t *TensorRTEngine::GetWeightCPUData( + const std::string &name, framework::Tensor *weight_tensor); + +template int64_t *TensorRTEngine::GetWeightCPUData( + const std::string &name, framework::Tensor *weight_tensor); + int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } nvinfer1::IPluginV2Layer *TensorRTEngine::AddPlugin( diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 8d28d1c05ea14..c75f7dd17cb95 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -268,6 +268,7 @@ class TensorRTEngine { void SetITensor(const std::string& name, nvinfer1::ITensor* tensor); // Get an ITensor called name. 
nvinfer1::ITensor* GetITensor(const std::string& name); + std::unordered_map* GetITensorMap(); nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::IExecutionContext* context() { @@ -405,9 +406,9 @@ class TensorRTEngine { void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { quant_dynamic_range_[tensor] = range; } - - float* GetWeightCPUData(const std::string& name, - framework::Tensor* weight_tensor); + template + T* GetWeightCPUData(const std::string& name, + framework::Tensor* weight_tensor); // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. @@ -424,7 +425,14 @@ class TensorRTEngine { static int suffix_counter = 0; std::string suffix = std::to_string(suffix_counter); std::string splitter = "__"; - weight_map[w_name + splitter + suffix] = std::move(w_tensor); + std::string name_with_suffix = w_name + splitter + suffix; + PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), + 0, + platform::errors::AlreadyExists( + "The weight named %s is set into the weight map " + "twice in TRT OP converter.", + name_with_suffix)); + weight_map[name_with_suffix] = std::move(w_tensor); suffix_counter += 1; } From cf8d42bb03fb2c3f69f10bab2d898b99115ac0ea Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 1 Jul 2022 14:03:53 +0800 Subject: [PATCH 024/250] [Dy2Stat]Polish break/continue statement transformer logic (#43489) * [Dy2Stat]Polish break/continue statement transformer logic --- .../dygraph_to_static/break_continue_transformer.py | 8 ++++---- .../dygraph/dygraph_to_static/variable_trans_func.py | 9 +++++++++ .../unittests/dygraph_to_static/test_break_continue.py | 6 +++++- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py index b85a2137dad81..7bce234168c7e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py @@ -20,7 +20,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import index_in_list from paddle.fluid.dygraph.dygraph_to_static.utils import ForNodeVisitor from paddle.fluid.dygraph.dygraph_to_static.utils import BaseNodeVisitor -from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node +from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_bool_node __all__ = ['BreakContinueTransformer'] @@ -140,7 +140,7 @@ def visit_Break(self, node): self._replace_if_stmt(loop_node_index, first_block_index, variable_name) # 4. For 'break' add break into condition of the loop. - assign_false_node = create_fill_constant_node(variable_name, False) + assign_false_node = create_bool_node(variable_name, False) self._add_stmt_before_cur_node(loop_node_index, assign_false_node) cond_var_node = gast.UnaryOp(op=gast.Not(), @@ -177,7 +177,7 @@ def visit_Continue(self, node): self._replace_if_stmt(loop_node_index, first_block_index, variable_name) # 4. 
For 'continue', set continue to False at the beginning of each loop - assign_false_node = create_fill_constant_node(variable_name, False) + assign_false_node = create_bool_node(variable_name, False) loop_node.body.insert(0, assign_false_node) def _remove_stmts_after_break_continue(self, break_continue_node, @@ -221,7 +221,7 @@ def _replace_break_continue_in_stmt_list(self, stmt_list, i = index_in_list(stmt_list, break_continue_node) if i == -1: return False - assign_true_node = create_fill_constant_node(break_continue_name, True) + assign_true_node = create_bool_node(break_continue_name, True) stmt_list[i:] = [assign_true_node] return True diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index 9bbce59fc54ce..28d7cff8cb0ca 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -76,3 +76,12 @@ def create_bool_as_type(x, value=True): return paddle.full(shape=[1], fill_value=value, dtype="bool") else: return value + + +def create_bool_node(name, value): + ''' + Create a assign stmt for name = value . + ''' + assert isinstance(value, bool) + node = "{} = {}".format(name, value) + return gast.parse(node).body[0] diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py index 79b6880b0d871..6b4b2d46a12f6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py @@ -101,7 +101,11 @@ def test_break_continue_in_for(x): x += 10086 a = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0) - for i in range(1, 10, 1): + b = fluid.layers.fill_constant(shape=[1], dtype='int32', value=3) + # b = 10 + # TODO: add Raise Error and suggestion for usage: + # Py for contains break/continue depends on control-flow. + for i in range(b): if a <= 4: x += 1 a += 1 From fac6a5f015babb5979d3f9cb38767d90f19a7fc3 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:06:05 +0800 Subject: [PATCH 025/250] Process sub-node in tensor_shape_transformer (#43998) --- .../fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index b7a2087d1f24d..5604a634a171b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -37,6 +37,7 @@ def transform(self): self.visit(self.root) def visit_Attribute(self, node): + self.generic_visit(node) if node.attr == 'shape': args = ast_to_source_code(node.value).strip() # NOTE(dev): we can deal with paddle.shape in this case, but it's From 53d5abe37224658b637c9db5200e7b4e0b7f949b Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Fri, 1 Jul 2022 14:23:21 +0800 Subject: [PATCH 026/250] Addition of switch_auto_tune option for transpose op (#43310) * 2nd part of transpose update * add switch_auto_tune option. * add some changes according to Ci * refine the structure of auto_tune_base. 
* merge develop changes * reset the switch_set_range and change unittest of transpose auto-tune * change the kernel auto-tune logits --- paddle/fluid/operators/fused/fmha_ref.h | 17 ++--- .../operators/fused/fused_gate_attention.h | 28 +++---- paddle/fluid/operators/transpose_op.cu.h | 62 +++++---------- paddle/phi/kernels/autotune/auto_tune_base.h | 76 ++++++++++++------- paddle/phi/kernels/autotune/auto_tune_test.cu | 19 ----- paddle/phi/kernels/autotune/cache.cc | 7 ++ paddle/phi/kernels/autotune/cache.h | 4 + .../phi/kernels/autotune/switch_autotune.cc | 1 + paddle/phi/kernels/gpu/transpose_kernel.cu | 3 +- .../tests/unittests/test_transpose_op.py | 35 +++++++++ 10 files changed, 131 insertions(+), 121 deletions(-) diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 3ac5718917346..ef1befbb32033 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -97,10 +97,9 @@ class FMHARef { // input shape: [bs, seq_len, 3, num_head, head_dim] // transpose with perm [2, 0, 3, 1, 4], // output_shape: [3, bs, num_head, seq_len, head_dim] - int ndims = 5; std::vector perm_1 = {2, 0, 3, 1, 4}; TransposeGPUKernelDriver( - dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor); + dev_ctx_, qkv_input_tensor, perm_1, transpose_2_out_tensor); T* qkv_data = transpose_2_out_tensor->data(); T* qk_out_data = qk_out_tensor->data(); T* qktv_out_data = qktv_out_tensor->data(); @@ -255,9 +254,8 @@ class FMHARef { // transpose: [0, 2, 1, 3] // output shape: [batch_size, seq_len, num_heads, head_dim] std::vector perm_3 = {0, 2, 1, 3}; - ndims = 4; TransposeGPUKernelDriver( - dev_ctx_, ndims, *qktv_out_tensor, perm_3, fmha_out_tensor); + dev_ctx_, *qktv_out_tensor, perm_3, fmha_out_tensor); } void ComputeBackward(const Tensor& transpose_2_out_tensor, @@ -297,10 +295,9 @@ class FMHARef { T* qktv_out_grad_data = qktv_out_grad_tensor->data(); // transpose bw - int ndims = 4; std::vector perm_3 = {0, 2, 1, 3}; TransposeGPUKernelDriver( - dev_ctx_, ndims, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor); + dev_ctx_, fmha_out_grad_tensor, perm_3, qktv_out_grad_tensor); // recall batchedgemm(nn) fw: softmax_out_data(x) * v_ptr(y) = // qktv_out_data(out) @@ -476,13 +473,9 @@ class FMHARef { stride_b); // transpose bw - ndims = 5; std::vector perm_1 = {1, 3, 0, 2, 4}; - TransposeGPUKernelDriver(dev_ctx_, - ndims, - *transpose_2_out_grad_tensor, - perm_1, - qkv_input_grad_tensor); + TransposeGPUKernelDriver( + dev_ctx_, *transpose_2_out_grad_tensor, perm_1, qkv_input_grad_tensor); } private: diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index 2dd923bd64d19..45d47908b99e0 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -622,11 +622,10 @@ class FMHAGateRef { Tensor* q_transpose_out, Tensor* k_transpose_out, Tensor* v_transpose_out) { - int ndims = 5; std::vector perm = {0, 1, 3, 2, 4}; - TransposeGPUKernelDriver(dev_ctx_, ndims, q_out, perm, q_transpose_out); - TransposeGPUKernelDriver(dev_ctx_, ndims, k_out, perm, k_transpose_out); - TransposeGPUKernelDriver(dev_ctx_, ndims, v_out, perm, v_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, q_out, perm, q_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, k_out, perm, k_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, v_out, perm, v_transpose_out); } void ComputeQKVTransposeBackward(const Tensor& 
q_transpose_out_grad, @@ -635,48 +634,41 @@ class FMHAGateRef { Tensor* q_out_grad, Tensor* k_out_grad, Tensor* v_out_grad) { - int ndims = 5; std::vector perm = {0, 1, 3, 2, 4}; TransposeGPUKernelDriver( - dev_ctx_, ndims, q_transpose_out_grad, perm, q_out_grad); + dev_ctx_, q_transpose_out_grad, perm, q_out_grad); TransposeGPUKernelDriver( - dev_ctx_, ndims, k_transpose_out_grad, perm, k_out_grad); + dev_ctx_, k_transpose_out_grad, perm, k_out_grad); TransposeGPUKernelDriver( - dev_ctx_, ndims, v_transpose_out_grad, perm, v_out_grad); + dev_ctx_, v_transpose_out_grad, perm, v_out_grad); } // [batch_size, seq_len_m, seq_len_r, 3, num_heads, head_dim] -> // [3, batch_size, seq_len_m, num_heads, seq_len_r, head_dim] void ComputeQKVTransposeForward(const Tensor& qkv_out, Tensor* qkv_transpose_out) { - int ndims = 6; std::vector perm = {3, 0, 1, 4, 2, 5}; - TransposeGPUKernelDriver( - dev_ctx_, ndims, qkv_out, perm, qkv_transpose_out); + TransposeGPUKernelDriver(dev_ctx_, qkv_out, perm, qkv_transpose_out); } void ComputeQKVTransposeBackward(const Tensor& qkv_transpose_out_grad, Tensor* qkv_out_grad) { - int ndims = 6; std::vector perm = {1, 2, 4, 0, 3, 5}; TransposeGPUKernelDriver( - dev_ctx_, ndims, qkv_transpose_out_grad, perm, qkv_out_grad); + dev_ctx_, qkv_transpose_out_grad, perm, qkv_out_grad); } // [batch_size, seq_len_m, num_head, seq_len_r, c] -> // [batch_size, seq_len_m, seq_len_r, num_head, c] void ComputeQKTVTransposeForward(const Tensor& qktv_out, Tensor* fmha_out) { - int ndims = 5; std::vector perm = {0, 1, 3, 2, 4}; - TransposeGPUKernelDriver(dev_ctx_, ndims, qktv_out, perm, fmha_out); + TransposeGPUKernelDriver(dev_ctx_, qktv_out, perm, fmha_out); } void ComputeQKTVTransposeBackward(const Tensor& fmha_out_grad, Tensor* qktv_out_grad) { - int ndims = 5; std::vector perm = {0, 1, 3, 2, 4}; - TransposeGPUKernelDriver( - dev_ctx_, ndims, fmha_out_grad, perm, qktv_out_grad); + TransposeGPUKernelDriver(dev_ctx_, fmha_out_grad, perm, qktv_out_grad); } // qk_out = qk_out + nonbatched_bias + src_mask diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index 1b90ad2c31384..0ae020c0dfd3c 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -22,7 +22,6 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h" -#include "paddle/phi/kernels/autotune/cache.h" namespace paddle { namespace operators { @@ -1155,50 +1154,31 @@ inline void SimplifyThenLaunch(const int rank, } template -size_t GetTransposeKey(const int rank, - const Tensor& in, - const std::vector& perm) { - auto in_shape = phi::vectorize(in.dims()); - return phi::autotune::GetKey( - in_shape, perm, rank, paddle::experimental::CppTypeToDataType::Type()); -} - -template -void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, - const int rank, +void TransposeGPUKernelDriver(const phi::GPUContext& ctx, const Tensor& in, const std::vector& perm, Tensor* out) { - PADDLE_ENFORCE_LT( - rank, - phi::DDim::kMaxRank, - platform::errors::OutOfRange( - "The maximum dimension rank of " - "tensor is expected to be less than %d, but here is %d.", - phi::DDim::kMaxRank, - rank)); - - auto ret = TransposeSimple::run(dev_ctx, in, perm, out); + const int rank = perm.size(); + auto ret = TransposeSimple::run(ctx, in, perm, out); if (!ret) { - auto* tuner = phi::autotune::MakeTransposeTuner( - SimplifyThenLaunch); - if (!tuner->IsInit()) { - tuner->AddCallBack( - phi::autotune::MakeCallback(TransCompute)); - tuner->Finalize(); - } - - auto key = GetTransposeKey(rank, in, perm); - auto& cache = phi::autotune::AutoTuneCache::Instance().GetTranspose(); - if (cache.Find(key)) { - auto index = cache.Get(key); - tuner->RunBestKernel(index, rank, dev_ctx, in, out, perm); - } else { - // All avaliable kernels have ran while picking the best kernel, so - // there may be no need for another RunBestKernel. - auto index = tuner->PickBestKernel(dev_ctx, rank, dev_ctx, in, out, perm); - cache.Set(key, index); - } + auto* tuner = + phi::autotune::MakeTransposeTuner(TransCompute); + tuner->AddCallBack( + phi::autotune::MakeCallback(SimplifyThenLaunch)); + + size_t key = phi::autotune::TransposeKey( + phi::vectorize(in.dims()), + perm, + paddle::experimental::CppTypeToDataType::Type()); + + tuner->Run(ctx, + phi::autotune::AlgorithmType::kTranspose, + key, + rank, + ctx, + in, + out, + perm); } } diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index 95afa7f697b49..91685c2ed547c 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -14,12 +14,10 @@ #pragma once -#include #include - #include "glog/logging.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/autotune/gpu_timer.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" namespace phi { namespace autotune { @@ -51,33 +49,61 @@ class AutoTuneBase { public: AutoTuneBase() {} virtual ~AutoTuneBase() {} - explicit AutoTuneBase(KernelType kernel) { kernels_.push_back(kernel); } - template - void AddCallBack(Type kernel) { - static_assert(std::is_same::value, - "Type must be the same"); - kernels_.push_back(kernel); + explicit AutoTuneBase(KernelType kernel) { + kernels_.push_back(/*default=*/kernel); } - template - void RunBestKernel(const int idx, Args&&... args) { - kernels_[idx].Run(args...); + void AddCallBack(KernelType kernel) { + if (!is_init_) { + std::lock_guard lock(mutex_); + kernels_.push_back(kernel); + } } - template - void RunDefaultKernel(Args&&... args) { - kernels_[0].Run(args...); + template + void Run(const Context& ctx, + const AlgorithmType& algo, + const size_t key, + Args&&... 
args) { + PADDLE_ENFORCE_GT( + kernels_.size(), + 0, + paddle::platform::errors::InvalidArgument( + "kernel num must be greater than 0, now is %d", kernels_.size())); + is_init_ = true; + + auto& cache = AutoTuneCache::Instance().Get(algo); + if (cache.Find(key)) { + auto best_idx = cache.Get(key); + kernels_[best_idx].Run(args...); + } else { + bool use_autotune = AutoTuneStatus::Instance().UseAutoTune(); + if (use_autotune) { + // All avaliable kernels have ran while picking the best kernel, + // so there may be no need for another kernel run. + auto best_idx = PickBestKernel(ctx, args...); + cache.Set(key, best_idx); + } else { + kernels_[0].Run(args...); + } + } } + private: + bool is_init_{false}; + std::vector kernels_; + mutable std::mutex mutex_; + template - int PickBestKernel(const Context& ctx, Args&&... args) { + size_t PickBestKernel(const Context& ctx, Args&&... args) { + std::lock_guard lock(mutex_); PADDLE_ENFORCE_GT( kernels_.size(), 0, paddle::platform::errors::InvalidArgument( "kernel num must be greater than 0, now is %d", kernels_.size())); - int best_idx = 0; + size_t best_idx = 0; float min_time = std::numeric_limits::max(); // Time cost test estabulished in default stream. @@ -92,23 +118,15 @@ class AutoTuneBase { return best_idx; } - bool IsInit() { return is_init_; } - void Finalize() { is_init_ = true; } - - private: - bool is_init_{false}; - std::vector kernels_; - template float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) { + // Regard 1st run as warmup. Judge the result by the time cost of rest run + // cycles. + constexpr int repeats = 3; phi::GpuTimer timer; float time_cost = 0; const auto& stream = ctx.stream(); - // Treat 1st run as warm up. Judge the result with - // the sum of 2nd and 3rd run. - constexpr int repeats = 3; - ctx.Wait(); for (int i = 0; i < repeats; ++i) { timer.Start(stream); @@ -151,7 +169,7 @@ std::once_flag TransposeAutoTuner::init_flag_; template static AutoTuneBase>* - MakeTransposeTuner(RetureType (*func)(Args...)) { +MakeTransposeTuner(RetureType (*func)(Args...)) { auto obj = MakeCallback(func); return TransposeAutoTuner::Instance(obj); } diff --git a/paddle/phi/kernels/autotune/auto_tune_test.cu b/paddle/phi/kernels/autotune/auto_tune_test.cu index d80790dbf2c15..2ac7b0b8b7509 100644 --- a/paddle/phi/kernels/autotune/auto_tune_test.cu +++ b/paddle/phi/kernels/autotune/auto_tune_test.cu @@ -131,24 +131,5 @@ TEST(AutoTune, sum) { timer.Stop(0); VLOG(3) << "kernel[" << i << "]: time cost is " << timer.ElapsedTime(); } - - // 2. Test call_back tune. - VLOG(3) << ">>> [AutoTune]: Test case."; - auto tuner = tune::MakeAutoTuner(Algo<4>); - tuner.AddCallBack(tune::MakeCallback(Algo<2>)); - tuner.AddCallBack(tune::MakeCallback(Algo<1>)); - - /* The 1st ctx works for ctx.Wait(), - the 2nd is just the param of call_back. 
*/ - auto best_index = tuner.PickBestKernel( - *dev_ctx, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); - - dev_ctx->Wait(); - phi::GpuTimer timer; - timer.Start(0); - tuner.RunBestKernel( - best_index, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); - timer.Stop(0); - VLOG(3) << "Best CallBackKernel time cost is " << timer.ElapsedTime(); #endif } diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc index 5e2c9e1c742ff..838f2dd265eb3 100644 --- a/paddle/phi/kernels/autotune/cache.cc +++ b/paddle/phi/kernels/autotune/cache.cc @@ -36,6 +36,13 @@ size_t ConvKey(const std::vector& x_dims, static_cast(dtype)); } +size_t TransposeKey(const std::vector& x_dims, + const std::vector& perm, + phi::DataType dtype) { + const auto rank = perm.size(); + return GetKey(x_dims, perm, rank, static_cast(dtype)); +} + std::string AlgorithmTypeString(int64_t algo_type) { if (algo_type == static_cast(AlgorithmType::kConvForward)) { return "conv_forward"; diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index 8de0695ede40c..1263cf40e567e 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -68,6 +68,10 @@ size_t ConvKey(const std::vector& x_dims, const std::vector& dilations, phi::DataType dtype); +size_t TransposeKey(const std::vector& x_dims, + const std::vector& perm, + phi::DataType dtype); + template class AlgorithmsCache { public: diff --git a/paddle/phi/kernels/autotune/switch_autotune.cc b/paddle/phi/kernels/autotune/switch_autotune.cc index 6fda24ef3c860..3742749b3bf03 100644 --- a/paddle/phi/kernels/autotune/switch_autotune.cc +++ b/paddle/phi/kernels/autotune/switch_autotune.cc @@ -29,6 +29,7 @@ void AutoTuneStatus::EnableAutoTune() { void AutoTuneStatus::DisableAutoTune() { FLAGS_use_autotune = false; + use_autotune_ = false; Init(); } diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu index 62e29950e2d89..3f3760a4890a2 100644 --- a/paddle/phi/kernels/gpu/transpose_kernel.cu +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -31,12 +31,11 @@ void TransposeKernel(const Context& ctx, const DenseTensor& x, const std::vector& axis, DenseTensor* out) { - int rank = axis.size(); ctx.template Alloc(out); if (out->numel() == 0) { return; } - paddle::operators::TransposeGPUKernelDriver(ctx, rank, x, axis, out); + paddle::operators::TransposeGPUKernelDriver(ctx, x, axis, out); } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index d9e293ba67159..fb48f63185075 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -126,6 +126,41 @@ def initTestCase(self): self.axis = (6, 1, 3, 5, 0, 2, 4, 7) +class TestAutoTuneTransposeOp(OpTest): + + def setUp(self): + self.init_op_type() + self.initTestCase() + self.python_api = paddle.transpose + self.inputs = {'X': np.random.random(self.shape).astype("float64")} + self.attrs = { + 'axis': list(self.axis), + 'use_mkldnn': self.use_mkldnn, + } + self.outputs = { + 'XShape': np.random.random(self.shape).astype("float64"), + 'Out': self.inputs['X'].transpose(self.axis) + } + + def initTestCase(self): + fluid.core.set_autotune_range(0, 3) + fluid.core.update_autotune_status() + fluid.core.enable_autotune() + self.shape = (1, 12, 256, 1) + self.axis = (0, 3, 2, 1) + + def init_op_type(self): + self.op_type = 
"transpose2" + self.use_mkldnn = False + + def test_check_output(self): + self.check_output(no_check_set=['XShape'], check_eager=True) + fluid.core.disable_autotune() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', check_eager=True) + + class TestTransposeBF16Op(OpTest): def setUp(self): From 5dfc0cd677758f1bdbcd6e66a2cd7369cd968062 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 1 Jul 2022 15:11:22 +0800 Subject: [PATCH 027/250] [Phi] Move all yaml to phi subdirectory (#43986) * move yaml to phi * fix undef var error --- .gitignore | 2 +- .../final_state_generator/CMakeLists.txt | 6 +- paddle/phi/api/lib/CMakeLists.txt | 65 ++++++++++--------- .../code_gen => paddle/phi/api/yaml}/api.yaml | 0 .../phi/api/yaml/api_compat.yaml | 0 .../phi/api/yaml}/api_version.yaml | 0 .../phi/api/yaml}/backward.yaml | 0 .../phi/api/yaml/generator}/api_base.py | 0 .../phi/api/yaml/generator}/api_gen.py | 6 +- .../api/yaml/generator}/backward_api_gen.py | 6 +- .../phi/api/yaml/generator}/cross_validate.py | 0 .../phi/api/yaml/generator}/filters.py | 0 .../phi/api/yaml/generator}/generate_op.py | 8 +-- .../yaml/generator}/intermediate_api_gen.py | 4 +- .../phi/api/yaml/generator}/parse_api.py | 0 .../phi/api/yaml/generator}/parse_utils.py | 0 .../phi/api/yaml/generator}/sparse_api_gen.py | 2 +- .../api/yaml/generator}/sparse_bw_api_gen.py | 7 +- .../api/yaml/generator}/strings_api_gen.py | 2 +- .../phi/api/yaml/generator}/templates/ks.c.j2 | 2 +- .../phi/api/yaml/generator}/templates/op.c.j2 | 2 +- .../generator}/templates/operator_utils.c.j2 | 0 .../phi/api/yaml/generator}/tests.py | 0 .../phi/api/yaml/generator}/type_mapping.py | 0 .../yaml/generator}/wrapped_infermeta_gen.py | 2 +- .../phi/api/yaml}/legacy_api.yaml | 0 .../phi/api/yaml}/legacy_backward.yaml | 0 .../phi/api/yaml}/sparse_api.yaml | 0 .../phi/api/yaml}/sparse_bw_api.yaml | 0 .../phi/api/yaml}/strings_api.yaml | 0 tools/infrt/generate_phi_kernel_dialect.py | 5 +- tools/infrt/get_phi_kernel_function.sh | 4 +- tools/infrt/get_phi_kernel_info.py | 4 +- 33 files changed, 63 insertions(+), 64 deletions(-) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml}/api.yaml (100%) rename python/paddle/utils/code_gen/args_compat.yaml => paddle/phi/api/yaml/api_compat.yaml (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml}/api_version.yaml (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml}/backward.yaml (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/api_base.py (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/api_gen.py (98%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/backward_api_gen.py (98%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/cross_validate.py (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/filters.py (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/generate_op.py (97%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/intermediate_api_gen.py (97%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/parse_api.py (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/parse_utils.py (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/sparse_api_gen.py (99%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/sparse_bw_api_gen.py (97%) rename {python/paddle/utils/code_gen => 
paddle/phi/api/yaml/generator}/strings_api_gen.py (99%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/templates/ks.c.j2 (85%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/templates/op.c.j2 (93%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/templates/operator_utils.c.j2 (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/tests.py (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/type_mapping.py (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml/generator}/wrapped_infermeta_gen.py (98%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml}/legacy_api.yaml (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml}/legacy_backward.yaml (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml}/sparse_api.yaml (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml}/sparse_bw_api.yaml (100%) rename {python/paddle/utils/code_gen => paddle/phi/api/yaml}/strings_api.yaml (100%) diff --git a/.gitignore b/.gitignore index b8a2e8fbce933..25ecd77e25de9 100644 --- a/.gitignore +++ b/.gitignore @@ -70,4 +70,4 @@ paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h # these files (directories) are generated before build system generation paddle/fluid/operators/generated_op.cc paddle/phi/ops/compat/generated_sig.cc -python/paddle/utils/code_gen/parsed_apis/ +paddle/phi/api/yaml/parsed_apis/ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 2a7e9b1cde181..8967354d244aa 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -1,8 +1,8 @@ set(api_yaml_path - "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/legacy_api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml" + "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/api.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_api.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_api.yaml" ) set(backward_yaml_path - "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/legacy_backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml" + "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_bw_api.yaml" ) set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc" @@ -30,7 +30,7 @@ set(nodes_h_path ) # StringTensor only needs forward api set(fwd_api_yaml_path - "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml") + "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/strings_api.yaml") message("Final State Eager CodeGen") add_custom_target( diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 750614561c520..2a1a6b4e78bd5 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -17,13 +17,13 @@ else() DEPS tensor_base dense_tensor phi_api_utils phi_enforce) endif() -set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) +set(api_gen_base ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_base.py) # 
forward api file -set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py) -set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) +set(api_gen_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_gen.py) +set(api_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/api.yaml) set(legacy_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/legacy_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_api.yaml) set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/api.h) set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/api.cc) set(api_header_file_tmp ${api_header_file}.tmp) @@ -31,11 +31,10 @@ set(api_source_file_tmp ${api_source_file}.tmp) # backward api file set(bw_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward_api_gen.py) -set(bw_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/backward_api_gen.py) +set(bw_api_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml) set(legacy_bw_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/legacy_backward.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml) set(bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/backward_api.h) set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc) @@ -44,7 +43,7 @@ set(bw_api_source_file_tmp ${bw_api_source_file}.tmp) # dygraph(intermediate) api file set(im_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/intermediate_api_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/intermediate_api_gen.py) set(dygraph_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/dygraph_api.h) set(dygraph_api_source_file @@ -54,9 +53,9 @@ set(dygraph_api_source_file_tmp ${dygraph_api_source_file}.tmp) # sparse api file set(sparse_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/sparse_api_gen.py) set(sparse_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_api.yaml) set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h) set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc) @@ -65,9 +64,9 @@ set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp) # sparse bw api file set(sparse_bw_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py) set(sparse_bw_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_bw_api.yaml) set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h) set(sparse_bw_api_source_file @@ -77,9 +76,9 @@ set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp) # strings api file set(strings_api_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/strings_api_gen.py) set(strings_api_yaml_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/strings_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/strings_api.yaml) set(strings_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/strings_api.h) set(strings_api_source_file @@ -89,7 +88,7 @@ set(strings_api_source_file_tmp 
${strings_api_source_file}.tmp) # wrapped infermeta file set(wrapped_infermeta_gen_file - ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py) set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) set(wrapped_infermeta_source_file @@ -109,7 +108,7 @@ else() endif() # parse apis -set(parsed_api_dir ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/parsed_apis) +set(parsed_api_dir ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/parsed_apis) set(generated_op_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc) set(generated_argument_mapping_path @@ -121,18 +120,20 @@ message( - ${bw_api_yaml_file} - ${legacy_bw_api_yaml_file}") execute_process( - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_api_dir} - COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./api.yaml + COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path ./api.yaml --output_path ./parsed_apis/api.parsed.yaml - COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./legacy_api.yaml - --output_path ./parsed_apis/legacy_api.parsed.yaml - COMMAND ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./backward.yaml - --output_path ./parsed_apis/backward_api.parsed.yaml --backward + COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path + ./legacy_api.yaml --output_path ./parsed_apis/legacy_api.parsed.yaml COMMAND - ${PYTHON_EXECUTABLE} parse_api.py --api_yaml_path ./legacy_backward.yaml - --output_path ./parsed_apis/legacy_backward_api.parsed.yaml --backward - RESULTS_VARIABLE _results) + ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path ./backward.yaml + --output_path ./parsed_apis/backward_api.parsed.yaml --backward + COMMAND + ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path + ./legacy_backward.yaml --output_path + ./parsed_apis/legacy_backward_api.parsed.yaml --backward RESULTS_VARIABLE + _results) foreach(_result in ${_results}) if(${_result}) message(FATAL_ERROR "api yaml parsing failed, exiting.") @@ -144,9 +145,9 @@ message("validate api yaml: - ${parsed_api_dir}/api.parsed.yaml - ${parsed_api_dir}/backward_api.parsed.yaml") execute_process( - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml COMMAND - ${PYTHON_EXECUTABLE} cross_validate.py --forward_yaml_paths + ${PYTHON_EXECUTABLE} generator/cross_validate.py --forward_yaml_paths ./parsed_apis/api.parsed.yaml ./parsed_apis/legacy_api.parsed.yaml --backward_yaml_paths ./parsed_apis/backward_api.parsed.yaml ./parsed_apis/legacy_backward_api.parsed.yaml @@ -161,13 +162,13 @@ message( create or remove auto-geneated argument mappings: ${generated_argument_mapping_path}.tmp" ) execute_process( - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml COMMAND - ${PYTHON_EXECUTABLE} generate_op.py --api_yaml_path + ${PYTHON_EXECUTABLE} generator/generate_op.py --api_yaml_path ./parsed_apis/api.parsed.yaml --backward_api_yaml_path ./parsed_apis/backward_api.parsed.yaml --api_version_yaml_path - api_version.yaml --api_args_compat_yaml_path args_compat.yaml - --output_op_path "${generated_op_path}.tmp" --output_arg_map_path + api_version.yaml --api_compat_yaml_path api_compat.yaml --output_op_path + 
"${generated_op_path}.tmp" --output_arg_map_path "${generated_argument_mapping_path}.tmp" RESULT_VARIABLE _result) if(${_result}) diff --git a/python/paddle/utils/code_gen/api.yaml b/paddle/phi/api/yaml/api.yaml similarity index 100% rename from python/paddle/utils/code_gen/api.yaml rename to paddle/phi/api/yaml/api.yaml diff --git a/python/paddle/utils/code_gen/args_compat.yaml b/paddle/phi/api/yaml/api_compat.yaml similarity index 100% rename from python/paddle/utils/code_gen/args_compat.yaml rename to paddle/phi/api/yaml/api_compat.yaml diff --git a/python/paddle/utils/code_gen/api_version.yaml b/paddle/phi/api/yaml/api_version.yaml similarity index 100% rename from python/paddle/utils/code_gen/api_version.yaml rename to paddle/phi/api/yaml/api_version.yaml diff --git a/python/paddle/utils/code_gen/backward.yaml b/paddle/phi/api/yaml/backward.yaml similarity index 100% rename from python/paddle/utils/code_gen/backward.yaml rename to paddle/phi/api/yaml/backward.yaml diff --git a/python/paddle/utils/code_gen/api_base.py b/paddle/phi/api/yaml/generator/api_base.py similarity index 100% rename from python/paddle/utils/code_gen/api_base.py rename to paddle/phi/api/yaml/generator/api_base.py diff --git a/python/paddle/utils/code_gen/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py similarity index 98% rename from python/paddle/utils/code_gen/api_gen.py rename to paddle/phi/api/yaml/generator/api_gen.py index a0775dd4c0a78..0893d0d5578f9 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/paddle/phi/api/yaml/generator/api_gen.py @@ -154,7 +154,7 @@ def gene_output(self, 0] == 'dense' else 'SetSelectedRowsKernelOutput' if return_type == 'std::vector': assert self.outputs['out_size_expr'][0] is not None, \ - f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + f"{self.api}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." output_create = output_create + f""" {code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr'][0]}, kernel_backend, &api_output);""" @@ -199,7 +199,7 @@ def gene_output(self, if out_dtype_list[i] == 'std::vector': assert self.outputs['out_size_expr'][i] is not None, \ - f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + f"{self.api}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
output_create = output_create + f""" {code_indent} auto kernel_out_{i} = {set_out_func}({self.outputs['out_size_expr'][i]}, kernel_backend, {get_out_code});""" @@ -313,7 +313,7 @@ def main(): parser.add_argument('--api_yaml_path', help='path to api yaml file', nargs='+', - default='python/paddle/utils/code_gen/api.yaml') + default='paddle/phi/api/yaml/api.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/paddle/phi/api/yaml/generator/backward_api_gen.py similarity index 98% rename from python/paddle/utils/code_gen/backward_api_gen.py rename to paddle/phi/api/yaml/generator/backward_api_gen.py index 2439eff9f63e5..67d47a8ec7432 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/paddle/phi/api/yaml/generator/backward_api_gen.py @@ -133,7 +133,7 @@ def gene_output(self, 0] == 'dense' else 'SetSelectedRowsKernelOutput' if out_dtype_list[0] == 'std::vector': assert self.outputs['out_size_expr'] is not None, \ - f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + f"{self.api}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." output_create = output_create + f""" {code_indent} auto kernel_out = {set_out_func}(&{self.outputs['names'][0]});""" @@ -164,7 +164,7 @@ def gene_output(self, {code_indent} *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" assert self.outputs['out_size_expr'][i] is not None, \ - f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." + f"{self.api}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
output_create = output_create + f""" {code_indent} auto kernel_out_{i} = {set_out_func}(&{self.outputs['names'][i]});""" @@ -279,7 +279,7 @@ def main(): parser.add_argument('--backward_yaml_path', help='path to backward yaml file', nargs='+', - default='python/paddle/utils/code_gen/backward.yaml') + default='paddle/phi/api/yaml/backward.yaml') parser.add_argument('--backward_header_path', help='output of generated backward header code file', default='paddle/phi/api/backward/backward_api.h') diff --git a/python/paddle/utils/code_gen/cross_validate.py b/paddle/phi/api/yaml/generator/cross_validate.py similarity index 100% rename from python/paddle/utils/code_gen/cross_validate.py rename to paddle/phi/api/yaml/generator/cross_validate.py diff --git a/python/paddle/utils/code_gen/filters.py b/paddle/phi/api/yaml/generator/filters.py similarity index 100% rename from python/paddle/utils/code_gen/filters.py rename to paddle/phi/api/yaml/generator/filters.py diff --git a/python/paddle/utils/code_gen/generate_op.py b/paddle/phi/api/yaml/generator/generate_op.py similarity index 97% rename from python/paddle/utils/code_gen/generate_op.py rename to paddle/phi/api/yaml/generator/generate_op.py index 469e264812760..627051365c3f7 100644 --- a/python/paddle/utils/code_gen/generate_op.py +++ b/paddle/phi/api/yaml/generator/generate_op.py @@ -54,7 +54,7 @@ def restruct_io(api): return api -def main(api_yaml_path, backward_yaml_path, api_args_compat_yaml_path, +def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, api_version_yaml_path, output_op_path, output_arg_map_path): with open(api_yaml_path, "rt") as f: apis = yaml.safe_load(f) @@ -72,7 +72,7 @@ def main(api_yaml_path, backward_yaml_path, api_args_compat_yaml_path, for api_version in api_versions: forward_api_dict[api_version['api']]['version'] = api_version['version'] - with open(api_args_compat_yaml_path, "rt") as f: + with open(api_compat_yaml_path, "rt") as f: api_args_map = yaml.safe_load(f) # replace args name for OpMaker for api_args in api_args_map: @@ -219,7 +219,7 @@ def main(api_yaml_path, backward_yaml_path, api_args_compat_yaml_path, parser.add_argument('--backward_api_yaml_path', type=str, help="parsed backward api yaml file.") - parser.add_argument('--api_args_compat_yaml_path', + parser.add_argument('--api_compat_yaml_path', type=str, help="api args compat yaml file.") parser.add_argument('--api_version_yaml_path', @@ -235,5 +235,5 @@ def main(api_yaml_path, backward_yaml_path, api_args_compat_yaml_path, args = parser.parse_args() main(args.api_yaml_path, args.backward_api_yaml_path, - args.api_args_compat_yaml_path, args.api_version_yaml_path, + args.api_compat_yaml_path, args.api_version_yaml_path, args.output_op_path, args.output_arg_map_path) diff --git a/python/paddle/utils/code_gen/intermediate_api_gen.py b/paddle/phi/api/yaml/generator/intermediate_api_gen.py similarity index 97% rename from python/paddle/utils/code_gen/intermediate_api_gen.py rename to paddle/phi/api/yaml/generator/intermediate_api_gen.py index 017099a64a344..c8ba88d054ac7 100644 --- a/python/paddle/utils/code_gen/intermediate_api_gen.py +++ b/paddle/phi/api/yaml/generator/intermediate_api_gen.py @@ -134,11 +134,11 @@ def main(): parser.add_argument('--api_yaml_path', nargs='+', help='path to api yaml file', - default='python/paddle/utils/code_gen/api.yaml') + default='paddle/phi/api/yaml/api.yaml') parser.add_argument('--sparse_api_yaml_path', help='path to sparse api yaml file', - default='python/paddle/utils/code_gen/sparse_api.yaml') + 
default='paddle/phi/api/yaml/sparse_api.yaml') parser.add_argument('--dygraph_api_header_path', help='output of generated dygraph api header code file', diff --git a/python/paddle/utils/code_gen/parse_api.py b/paddle/phi/api/yaml/generator/parse_api.py similarity index 100% rename from python/paddle/utils/code_gen/parse_api.py rename to paddle/phi/api/yaml/generator/parse_api.py diff --git a/python/paddle/utils/code_gen/parse_utils.py b/paddle/phi/api/yaml/generator/parse_utils.py similarity index 100% rename from python/paddle/utils/code_gen/parse_utils.py rename to paddle/phi/api/yaml/generator/parse_utils.py diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/paddle/phi/api/yaml/generator/sparse_api_gen.py similarity index 99% rename from python/paddle/utils/code_gen/sparse_api_gen.py rename to paddle/phi/api/yaml/generator/sparse_api_gen.py index aa087c9136b13..17eb70e5c3e5f 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_api_gen.py @@ -280,7 +280,7 @@ def main(): description='Generate PaddlePaddle C++ Sparse API files') parser.add_argument('--api_yaml_path', help='path to sparse api yaml file', - default='python/paddle/utils/code_gen/sparse_api.yaml') + default='paddle/phi/api/yaml/sparse_api.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py similarity index 97% rename from python/paddle/utils/code_gen/sparse_bw_api_gen.py rename to paddle/phi/api/yaml/generator/sparse_bw_api_gen.py index 834e3d45d0b85..e30c5e3c5d05c 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py @@ -162,10 +162,9 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): def main(): parser = argparse.ArgumentParser( description='Generate PaddlePaddle C++ Sparse API files') - parser.add_argument( - '--api_yaml_path', - help='path to sparse api yaml file', - default='python/paddle/utils/code_gen/sparse_bw_api.yaml') + parser.add_argument('--api_yaml_path', + help='path to sparse api yaml file', + default='paddle/phi/api/yaml/sparse_bw_api.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/python/paddle/utils/code_gen/strings_api_gen.py b/paddle/phi/api/yaml/generator/strings_api_gen.py similarity index 99% rename from python/paddle/utils/code_gen/strings_api_gen.py rename to paddle/phi/api/yaml/generator/strings_api_gen.py index 1f3ec587d7413..bb5a7a2413d8e 100644 --- a/python/paddle/utils/code_gen/strings_api_gen.py +++ b/paddle/phi/api/yaml/generator/strings_api_gen.py @@ -351,7 +351,7 @@ def main(): description='Generate PaddlePaddle C++ Strings API files') parser.add_argument('--api_yaml_path', help='path to sparse api yaml file', - default='python/paddle/utils/code_gen/strings_api.yaml') + default='paddle/phi/api/yaml/strings_api.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/python/paddle/utils/code_gen/templates/ks.c.j2 b/paddle/phi/api/yaml/generator/templates/ks.c.j2 similarity index 85% rename from python/paddle/utils/code_gen/templates/ks.c.j2 rename to paddle/phi/api/yaml/generator/templates/ks.c.j2 index 54618f0e1e6a1..2855e05b3ca53 100644 --- a/python/paddle/utils/code_gen/templates/ks.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/ks.c.j2 @@ -1,5 +1,5 @@ {% from 
"operator_utils.c.j2" import name_map, register_name_map %} -// this file is generated by python/paddle/utils/code_gen/generate_op.py, do not edit. +// this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit. #include "paddle/phi/core/compat/op_utils.h" #include "paddle/utils/small_vector.h" diff --git a/python/paddle/utils/code_gen/templates/op.c.j2 b/paddle/phi/api/yaml/generator/templates/op.c.j2 similarity index 93% rename from python/paddle/utils/code_gen/templates/op.c.j2 rename to paddle/phi/api/yaml/generator/templates/op.c.j2 index 5c9559d1c89f8..7f13eb9582589 100644 --- a/python/paddle/utils/code_gen/templates/op.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/op.c.j2 @@ -1,5 +1,5 @@ {% from "operator_utils.c.j2" import op_maker, backward_op_maker, operator, register_op_with_components, register_op_version %} -// this file is generated by python/paddle/utils/code_gen/generate_op.py, do not edit. +// this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit. #include #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/python/paddle/utils/code_gen/templates/operator_utils.c.j2 b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 similarity index 100% rename from python/paddle/utils/code_gen/templates/operator_utils.c.j2 rename to paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 diff --git a/python/paddle/utils/code_gen/tests.py b/paddle/phi/api/yaml/generator/tests.py similarity index 100% rename from python/paddle/utils/code_gen/tests.py rename to paddle/phi/api/yaml/generator/tests.py diff --git a/python/paddle/utils/code_gen/type_mapping.py b/paddle/phi/api/yaml/generator/type_mapping.py similarity index 100% rename from python/paddle/utils/code_gen/type_mapping.py rename to paddle/phi/api/yaml/generator/type_mapping.py diff --git a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py similarity index 98% rename from python/paddle/utils/code_gen/wrapped_infermeta_gen.py rename to paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py index 56a55cfe80629..99da6ce3d955f 100644 --- a/python/paddle/utils/code_gen/wrapped_infermeta_gen.py +++ b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py @@ -160,7 +160,7 @@ def main(): parser.add_argument('--api_yaml_path', help='path to api yaml file', nargs='+', - default='python/paddle/utils/code_gen/api.yaml') + default='paddle/phi/api/yaml/api.yaml') parser.add_argument( '--wrapped_infermeta_header_path', help='output of generated wrapped_infermeta header code file', diff --git a/python/paddle/utils/code_gen/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml similarity index 100% rename from python/paddle/utils/code_gen/legacy_api.yaml rename to paddle/phi/api/yaml/legacy_api.yaml diff --git a/python/paddle/utils/code_gen/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml similarity index 100% rename from python/paddle/utils/code_gen/legacy_backward.yaml rename to paddle/phi/api/yaml/legacy_backward.yaml diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml similarity index 100% rename from python/paddle/utils/code_gen/sparse_api.yaml rename to paddle/phi/api/yaml/sparse_api.yaml diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml similarity index 100% rename from python/paddle/utils/code_gen/sparse_bw_api.yaml rename to 
paddle/phi/api/yaml/sparse_bw_api.yaml diff --git a/python/paddle/utils/code_gen/strings_api.yaml b/paddle/phi/api/yaml/strings_api.yaml similarity index 100% rename from python/paddle/utils/code_gen/strings_api.yaml rename to paddle/phi/api/yaml/strings_api.yaml diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index c0a5139313029..39b0d5484a8ff 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -72,12 +72,11 @@ def get_skipped_kernel_list(): def get_api_yaml_info(file_path): apis = [] - with open(file_path + "/python/paddle/utils/code_gen/api.yaml", 'r') as f: + with open(file_path + "/paddle/phi/api/yaml/api.yaml", 'r') as f: api_list = yaml.load(f, Loader=yaml.FullLoader) if api_list: apis.extend(api_list) - with open(file_path + "/python/paddle/utils/code_gen/legacy_api.yaml", - 'r') as f: + with open(file_path + "/paddle/phi/api/yaml/legacy_api.yaml", 'r') as f: legacy_api_list = yaml.load(f, Loader=yaml.FullLoader) if legacy_api_list: apis.extend(legacy_api_list) diff --git a/tools/infrt/get_phi_kernel_function.sh b/tools/infrt/get_phi_kernel_function.sh index 92076803cf65e..69926e28cb54b 100644 --- a/tools/infrt/get_phi_kernel_function.sh +++ b/tools/infrt/get_phi_kernel_function.sh @@ -77,8 +77,8 @@ done #step 2:get simple general inferMeta function wrap info temp_path=`mktemp -d` -python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ - --api_yaml_path ${PADDLE_ROOT}/python/paddle/utils/code_gen/api.yaml ${PADDLE_ROOT}/python/paddle/utils/code_gen/legacy_api.yaml \ +python3 ${PADDLE_ROOT}/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py \ + --api_yaml_path ${PADDLE_ROOT}/paddle/phi/api/yaml/api.yaml ${PADDLE_ROOT}/paddle/phi/api/yaml/legacy_api.yaml \ --wrapped_infermeta_header_path ${temp_path}/generate.h \ --wrapped_infermeta_source_path ${temp_path}/generate.cc diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index 6c4f40d215fc1..4837ca582135c 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -20,8 +20,8 @@ from typing import List, Dict, Any skipped_phi_api_list_file = "/tools/infrt/skipped_phi_api.json" -api_yaml_file = "/python/paddle/utils/code_gen/api.yaml" -legacy_api_yaml_file = "/python/paddle/utils/code_gen/legacy_api.yaml" +api_yaml_file = "/paddle/phi/api/yaml/api.yaml" +legacy_api_yaml_file = "/paddle/phi/api/yaml/legacy_api.yaml" def get_skipped_kernel_list(): From f3bdabc1ceee5a2df918a91f7f89b119e9a0207e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Fri, 1 Jul 2022 16:10:33 +0800 Subject: [PATCH 028/250] fixes a bug, test=develop (#43970) --- python/paddle/fluid/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 3d071fce6c77e..7a39c2bc2fbc0 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -376,7 +376,7 @@ def name_has_fc(var): vars=list(filter(predicate, main_program.list_vars())), filename=filename) else: - params_var_name = unique_name.generate("saved_params") + params_var_name = "saved_params" # give warning when there is no var in model if len(list(vars)) == 0: warnings.warn( From 692466b4c700b9e369efead356aebf8a2022f9fb Mon Sep 17 00:00:00 2001 From: Jiaqi Liu <709153940@qq.com> Date: Fri, 1 Jul 2022 16:16:08 +0800 Subject: [PATCH 029/250] Make accuracy function 
support dtype int64 for input label (#43003) * support int64 for acc * support int64 for acc --- python/paddle/metric/metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 4d28b68f99456..919daa31d06fc 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -776,7 +776,7 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None): Args: input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. The shape is ``[sample_number, class_dim]`` . - label(Tensor): The label of dataset. Tensor with type int64. The shape is ``[sample_number, 1]`` . + label(Tensor): The label of dataset. Tensor with type int64 or int32. The shape is ``[sample_number, 1]`` . k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. total(Tensor, optional): The total entries count. A tensor with type int64 or int32. @@ -796,6 +796,8 @@ def accuracy(input, label, k=1, correct=None, total=None, name=None): result = paddle.metric.accuracy(input=predictions, label=label, k=1) # [0.5] """ + if label.dtype == paddle.int32: + label = paddle.cast(label, paddle.int64) if _non_static_mode(): if correct is None: correct = _varbase_creator(dtype="int32") From 37f2151f6d1ff4e9c4abe7a9893be1165c1c2edc Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Fri, 1 Jul 2022 16:40:53 +0800 Subject: [PATCH 030/250] [Err msg] optimize kernel call error message (#44001) --- paddle/fluid/imperative/prepared_operator.h | 52 ++++++++++++--------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index f45b72055ec4e..7ed4346ed82c2 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -257,29 +257,35 @@ void BuildDygraphPhiKernelContext(const phi::KernelSignature& kernel_signature, auto& output_defs = phi_kernel.args_def().output_defs(); auto& attr_defs = phi_kernel.args_def().attribute_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), - input_defs.size(), - platform::errors::InvalidArgument( - "the size of inputs_args names (%d) must be equal to " - "the size of kernel input_defs (%d).", - input_names.size(), - input_defs.size())); - - PADDLE_ENFORCE_EQ(output_names.size(), - output_defs.size(), - platform::errors::InvalidArgument( - "the size of outputs_args names (%d) must be equal to " - "the size of kernel output_defs (%d).", - output_names.size(), - output_defs.size())); - - PADDLE_ENFORCE_EQ(attr_names.size(), - attr_defs.size(), - platform::errors::InvalidArgument( - "the size of attribute_args names (%d) must be equal " - "to the size of kernel attribute_defs (%d).", - attr_names.size(), - attr_defs.size())); + PADDLE_ENFORCE_EQ( + input_names.size(), + input_defs.size(), + platform::errors::InvalidArgument( + "Op %s: the size of inputs_args names (%d) must be equal to " + "the size of kernel input_defs (%d).", + kernel_signature.name, + input_names.size(), + input_defs.size())); + + PADDLE_ENFORCE_EQ( + output_names.size(), + output_defs.size(), + platform::errors::InvalidArgument( + "Op %s: the size of outputs_args names (%d) must be equal to " + "the size of kernel output_defs (%d).", + kernel_signature.name, + output_names.size(), + output_defs.size())); + + 
PADDLE_ENFORCE_EQ( + attr_names.size(), + attr_defs.size(), + platform::errors::InvalidArgument( + "Op %s: the size of attribute_args names (%d) must be equal " + "to the size of kernel attribute_defs (%d).", + kernel_signature.name, + attr_names.size(), + attr_defs.size())); for (size_t i = 0; i < input_names.size(); ++i) { auto it = ins.find(input_names[i]); From 9a1fdad3ae1a1814200856f86744a1d0134bceff Mon Sep 17 00:00:00 2001 From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com> Date: Fri, 1 Jul 2022 16:43:45 +0800 Subject: [PATCH 031/250] update new unittests of flatten ops and layernorm, *test=kunlun (#43895) --- .../unittests/xpu/test_flatten2_op_xpu.py | 93 ++-- .../test_flatten_contiguous_range_op_xpu.py | 416 +++++++++--------- .../unittests/xpu/test_flatten_op_xpu.py | 83 ++-- .../unittests/xpu/test_layer_norm_op_xpu.py | 108 ++--- 4 files changed, 366 insertions(+), 334 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py index 819fd1248fecf..392eed198ff95 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py @@ -23,67 +23,80 @@ import paddle.fluid as fluid from op_test import OpTest from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestFlatten2Op(XPUOpTest): +class XPUTestFlatten2Op(XPUOpTestWrapper): - def setUp(self): - self.set_xpu() - self.op_type = "flatten2" - self.place = paddle.XPUPlace(0) - self.init_test_case() - self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.in_shape).astype("float32") - } + def __init__(self): + self.op_name = 'flatten2' + self.use_dynamic_create_class = False - def set_xpu(self): - self.__class__.use_xpu = True + class TestFlatten2Op(XPUOpTest): - def test_check_output(self): - self.check_output_with_place(self.place, no_check_set=["XShape"]) + def setUp(self): + self.set_xpu() + self.op_type = "flatten2" + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.init_test_case() + self.inputs = { + "X": np.random.random(self.in_shape).astype(self.dtype) + } + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype(self.dtype) + } - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") + def set_xpu(self): + self.__class__.use_xpu = True - def init_test_case(self): - self.in_shape = (3, 2, 4, 5) - self.axis = 1 - self.new_shape = (3, 40) + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) - def init_attrs(self): - self.attrs = {"axis": self.axis} + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + def init_test_case(self): + self.in_shape = (3, 2, 4, 5) + self.axis = 1 + self.new_shape = (3, 40) -class TestFlatten2OpWithCornerAxis(TestFlatten2Op): + def init_attrs(self): + self.attrs = {"axis": self.axis} - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.axis = 0 - self.new_shape = (1, 120) + class TestFlatten2OpWithCornerAxis(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.axis = 0 + self.new_shape = (1, 120) -class 
TestFlatten2OpWithDefaultAxis(TestFlatten2Op): + class TestFlatten2OpWithDefaultAxis(TestFlatten2Op): - def init_test_case(self): - self.in_shape = (10, 2, 2, 3) - self.new_shape = (10, 12) + def init_test_case(self): + self.in_shape = (10, 2, 2, 3) + self.new_shape = (10, 12) - def init_attrs(self): - self.attrs = {} + def init_attrs(self): + self.attrs = {} + class TestFlatten2OpSixDims(TestFlatten2Op): -class TestFlatten2OpSixDims(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.axis = 4 - self.new_shape = (36, 16) +support_types = get_xpu_op_support_types('flatten2') +support_types_for_grad = get_xpu_op_support_types('mean') +for stype in support_types: + if stype in support_types_for_grad: + create_test_class(globals(), XPUTestFlatten2Op, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py index 06fc12f510844..c9426f54b1cf6 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py @@ -17,7 +17,6 @@ import sys sys.path.append("..") - import numpy as np import unittest import sys @@ -27,215 +26,214 @@ from op_test_xpu import XPUOpTest import paddle import paddle.fluid as fluid +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestFlattenOp(XPUOpTest): - - def setUp(self): - self.set_xpu() - self.op_type = "flatten_contiguous_range" - self.place = paddle.XPUPlace(0) - self.use_xpu = True - self.use_mkldnn = False - - self.start_axis = 0 - self.stop_axis = -1 - self.dtype = np.float32 - self.init_test_case() - self.inputs = {"X": np.random.random(self.in_shape).astype(self.dtype)} - self.init_attrs() - self.outputs = { - "Out": self.inputs["X"].reshape(self.new_shape), - "XShape": np.random.random(self.in_shape).astype("float32") - } - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place, no_check_set=["XShape"]) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = -1 - self.new_shape = (120) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis, - 'use_xpu': True, - } - - -class TestFlattenOp_1(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 1 - self.stop_axis = 2 - self.new_shape = (3, 10, 4) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_2(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_3(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 2 - self.new_shape = (30, 4) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class 
TestFlattenOp_4(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = -2 - self.stop_axis = -1 - self.new_shape = (3, 2, 20) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_5(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 2 - self.stop_axis = 2 - self.new_shape = (3, 2, 5, 4) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOpSixDims(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.start_axis = 3 - self.stop_axis = 5 - self.new_shape = (3, 2, 3, 32) - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_Float32(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - self.dtype = np.float32 - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - -class TestFlattenOp_int32(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - self.dtype = np.int32 - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis, - 'use_xpu': True - } - - def test_check_grad(self): - pass - - -class TestFlattenOp_int8(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - self.dtype = np.int8 - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - def test_check_grad(self): - pass - - -class TestFlattenOp_int64(TestFlattenOp): - - def init_test_case(self): - self.in_shape = (3, 2, 5, 4) - self.start_axis = 0 - self.stop_axis = 1 - self.new_shape = (6, 5, 4) - self.dtype = np.int64 - - def init_attrs(self): - self.attrs = { - "start_axis": self.start_axis, - "stop_axis": self.stop_axis - } - - def test_check_grad(self): - pass +class XPUTestFlattenOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'flatten_contiguous_range' + self.use_dynamic_create_class = False + + class TestFlattenOp(XPUOpTest): + + def setUp(self): + self.set_xpu() + self.op_type = "flatten_contiguous_range" + self.place = paddle.XPUPlace(0) + self.use_xpu = True + self.use_mkldnn = False + + self.start_axis = 0 + self.stop_axis = -1 + self.dtype = self.in_type + self.init_test_case() + self.inputs = { + "X": np.random.random(self.in_shape).astype(self.dtype) + } + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype(self.dtype) + } + + def set_xpu(self): + self.__class__.use_xpu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = -1 + self.new_shape = (120) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis, + 'use_xpu': True, + } + + class TestFlattenOp_1(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + 
self.start_axis = 1 + self.stop_axis = 2 + self.new_shape = (3, 10, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_2(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_3(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 2 + self.new_shape = (30, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_4(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = -2 + self.stop_axis = -1 + self.new_shape = (3, 2, 20) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_5(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 2 + self.stop_axis = 2 + self.new_shape = (3, 2, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOpSixDims(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.start_axis = 3 + self.stop_axis = 5 + self.new_shape = (3, 2, 3, 32) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_Float32(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.float32 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + class TestFlattenOp_int32(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int32 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis, + 'use_xpu': True + } + + def test_check_grad(self): + pass + + class TestFlattenOp_int8(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int8 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + def test_check_grad(self): + pass + + class TestFlattenOp_int64(TestFlattenOp): + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int64 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + def test_check_grad(self): + pass class TestFlatten2OpError(unittest.TestCase): @@ -338,5 +336,11 @@ def test_Negative(): self.assertTrue((2, 3, 16) == res_shape) +support_types = get_xpu_op_support_types('flatten_contiguous_range') +support_types_for_grad = get_xpu_op_support_types('mean') +for stype in support_types: + if stype in support_types_for_grad: + create_test_class(globals(), XPUTestFlattenOp, stype) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py 
b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py index 9622fc5bb1a82..c3c732fa77177 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py @@ -23,61 +23,74 @@ import paddle.fluid as fluid from op_test import OpTest from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestFlattenOp(XPUOpTest): +class XPUTestFlattenOp(XPUOpTestWrapper): - def setUp(self): - self.op_type = "flatten" - self.use_xpu = True - self.place = paddle.XPUPlace(0) - self.init_test_case() - self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} - self.init_attrs() - self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + def __init__(self): + self.op_name = 'flatten' + self.use_dynamic_create_class = False - def test_check_output(self): - self.check_output_with_place(self.place) + class TestFlattenOp(XPUOpTest): - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") + def setUp(self): + self.op_type = "flatten" + self.use_xpu = True + self.place = paddle.XPUPlace(0) + self.init_test_case() + self.dtype = self.in_type + self.inputs = { + "X": np.random.random(self.in_shape).astype(self.dtype) + } + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 1 - self.new_shape = (3, 40) + def test_check_output(self): + self.check_output_with_place(self.place) - def init_attrs(self): - self.attrs = {"axis": self.axis} + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 1 + self.new_shape = (3, 40) -class TestFlattenOp1(TestFlattenOp): + def init_attrs(self): + self.attrs = {"axis": self.axis} - def init_test_case(self): - self.in_shape = (3, 2, 2, 10) - self.axis = 0 - self.new_shape = (1, 120) + class TestFlattenOp1(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 0 + self.new_shape = (1, 120) -class TestFlattenOpWithDefaultAxis(TestFlattenOp): + class TestFlattenOpWithDefaultAxis(TestFlattenOp): - def init_test_case(self): - self.in_shape = (10, 2, 2, 3) - self.new_shape = (10, 12) + def init_test_case(self): + self.in_shape = (10, 2, 2, 3) + self.new_shape = (10, 12) - def init_attrs(self): - self.attrs = {} + def init_attrs(self): + self.attrs = {} + class TestFlattenOpSixDims(TestFlattenOp): -class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) - def init_test_case(self): - self.in_shape = (3, 2, 3, 2, 4, 4) - self.axis = 4 - self.new_shape = (36, 16) +support_types = get_xpu_op_support_types('flatten') +support_types_for_grad = get_xpu_op_support_types('mean') +for stype in support_types: + if stype in support_types_for_grad: + create_test_class(globals(), XPUTestFlattenOp, stype) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py index 1f2caa9fbe9d8..8cab945b45978 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py @@ -20,7 +20,9 @@ sys.path.append("..") from op_test import 
OpTest +from op_test_xpu import XPUOpTest from operator import mul +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() @@ -42,77 +44,77 @@ def ref_layer_norm(x, scale, bias, epsilon, begin_norm_axis=1): return y, mean, variance -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPULayerNormOp(OpTest): +class XPUTestLayerNormOp(XPUOpTestWrapper): - def setUp(self): - self.op_type = "layer_norm" - self.dtype = np.float32 - self.shape = [2, 3, 4, 5] - self.epsilon = 1e-05 - self.begin_norm_axis = 1 - self.set_attrs() + def __init__(self): + self.op_name = 'layer_norm' + self.use_dynamic_create_class = False - right = reduce(mul, self.shape[self.begin_norm_axis:len(self.shape)], 1) - np.random.seed(10) - x_np = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) - scale_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) - bias_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) - ref_y_np, ref_mean_np, ref_variance_np = ref_layer_norm( - x_np, scale_np, bias_np, self.epsilon, self.begin_norm_axis) + class TestXPULayerNormOp(XPUOpTest): - self.inputs = {'X': x_np, 'Scale': scale_np, 'Bias': bias_np} - self.outputs = { - 'Y': ref_y_np, - 'Mean': ref_mean_np, - 'Variance': ref_variance_np - } - self.attrs = {'begin_norm_axis': self.begin_norm_axis, 'use_xpu': True} + def setUp(self): + self.op_type = "layer_norm" + self.dtype = self.in_type + self.shape = [2, 3, 4, 5] + self.epsilon = 1e-05 + self.begin_norm_axis = 1 + self.set_attrs() - def set_attrs(self): - pass + right = reduce(mul, + self.shape[self.begin_norm_axis:len(self.shape)], 1) + np.random.seed(10) + x_np = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + scale_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) + bias_np = np.random.uniform(0.1, 1, [right]).astype(self.dtype) + ref_y_np, ref_mean_np, ref_variance_np = ref_layer_norm( + x_np, scale_np, bias_np, self.epsilon, self.begin_norm_axis) - def test_check_output(self): - self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) + self.inputs = {'X': x_np, 'Scale': scale_np, 'Bias': bias_np} + self.outputs = { + 'Y': ref_y_np, + 'Mean': ref_mean_np, + 'Variance': ref_variance_np + } + self.attrs = { + 'begin_norm_axis': self.begin_norm_axis, + 'use_xpu': True + } - def test_check_grad(self): - self.check_grad_with_place(paddle.XPUPlace(0), ['X'], - 'Y', - max_relative_error=0.02) + def set_attrs(self): + pass + def test_check_output(self): + self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPULayerNormOpAxis2(TestXPULayerNormOp): + def test_check_grad(self): + self.check_grad_with_place(paddle.XPUPlace(0), ['X'], + 'Y', + max_relative_error=0.02) - def set_attrs(self): - self.begin_norm_axis = 2 + class TestXPULayerNormOpAxis2(TestXPULayerNormOp): + def set_attrs(self): + self.begin_norm_axis = 2 -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPULayerNormOpAxis3(TestXPULayerNormOp): + class TestXPULayerNormOpAxis3(TestXPULayerNormOp): - def set_attrs(self): - self.begin_norm_axis = 3 + def set_attrs(self): + self.begin_norm_axis = 3 + class TestXPULayerNormOp2D(TestXPULayerNormOp): -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPULayerNormOp2D(TestXPULayerNormOp): + def set_attrs(self): + self.shape = [10, 12] - 
def set_attrs(self): - self.shape = [10, 12] + class TestXPULayerNormOp3D(TestXPULayerNormOp): + def set_attrs(self): + self.shape = [4, 5, 6] -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPULayerNormOp3D(TestXPULayerNormOp): - - def set_attrs(self): - self.shape = [4, 5, 6] +support_types = get_xpu_op_support_types('layer_norm') +for stype in support_types: + create_test_class(globals(), XPUTestLayerNormOp, stype) if __name__ == "__main__": unittest.main() From b9640034d59db748c9aee8724f7360f5ecfe1da4 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Fri, 1 Jul 2022 17:42:00 +0800 Subject: [PATCH 032/250] Make inference_api_test compile with shared lib in Windows (#43946) * Revert "Revert "make inference_api_test compile with dynamic linking library (#41944)" (#43882)" This reverts commit e6d81ddf03b1d7f478ec54308e6b20ac9272cd99. * modify third_party cmake * move SKIP_CPP_TEST return in the begining --- cmake/external/glog.cmake | 2 +- cmake/third_party.cmake | 2 +- paddle/fluid/inference/CMakeLists.txt | 2 + .../fluid/inference/analysis/CMakeLists.txt | 8 +-- paddle/fluid/inference/capi/CMakeLists.txt | 4 -- .../fluid/inference/capi_exp/CMakeLists.txt | 4 -- .../fluid/inference/tests/api/CMakeLists.txt | 68 +++++++++++++------ .../analyzer_image_classification_tester.cc | 10 +-- .../inference/tests/api/trt_fc_prelu_test.cc | 5 ++ .../inference/tests/api/trt_mobilenet_test.cc | 5 ++ .../inference/tests/api/trt_resnext_test.cc | 5 ++ paddle/phi/common/place.h | 5 +- paddle/scripts/paddle_build.bat | 3 +- 13 files changed, 80 insertions(+), 43 deletions(-) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index df1b827ed1824..456c651a197f9 100755 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -45,7 +45,7 @@ if(WITH_ARM_BRPC) file( WRITE ${GLOG_SOURCE_DIR}/CMakeLists.txt "PROJECT(ARM_GLOGS)\n" "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY arm_glog/include arm_glog/lib \n" + "install(DIRECTORY arm_glog/include arm_glog/lib\n" " DESTINATION . 
USE_SOURCE_PERMISSIONS)\n") ExternalProject_Add( extern_glog diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index b96656778d60c..3cefa0dfa26a2 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -225,7 +225,7 @@ if(NOT DEFINED WITH_MKLDNN) if(WITH_MKL AND AVX2_FOUND) set(WITH_MKLDNN ON) else() - message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") + message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN.") set(WITH_MKLDNN OFF) endif() endif() diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 4e991a3013875..6ff4655429604 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -119,6 +119,8 @@ cc_library( get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_inference_shared ${os_dependency_modules}) if(WIN32) + set_property(TARGET paddle_inference_shared + PROPERTY WINDOWS_EXPORT_ALL_SYMBOLS ON) target_link_libraries(paddle_inference_shared gflags) endif() diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 4b7bed65bab77..c001f5eb8dfdc 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -49,10 +49,10 @@ function(inference_analysis_test_build TARGET) SRCS ${analysis_test_SRCS} DEPS + ${analysis_test_EXTRA_DEPS} analysis pass - ${GLOB_PASS_LIB} - ${analysis_test_EXTRA_DEPS}) + ${GLOB_PASS_LIB}) endif() endfunction() @@ -80,10 +80,10 @@ function(inference_analysis_test TARGET) SRCS ${analysis_test_SRCS} DEPS + ${analysis_test_EXTRA_DEPS} analysis pass - ${GLOB_PASS_LIB} - ${analysis_test_EXTRA_DEPS}) + ${GLOB_PASS_LIB}) inference_base_test_run(${TARGET} COMMAND ${TARGET} ARGS ${analysis_test_ARGS}) endif() diff --git a/paddle/fluid/inference/capi/CMakeLists.txt b/paddle/fluid/inference/capi/CMakeLists.txt index 73ba41607aae8..25d8a39dc6374 100644 --- a/paddle/fluid/inference/capi/CMakeLists.txt +++ b/paddle/fluid/inference/capi/CMakeLists.txt @@ -20,10 +20,6 @@ cc_library( SRCS ${C_API_SRCS} DEPS paddle_inference) -if(NOT ON_INFER) - return() -endif() - # Create inference capi shared library cc_library( paddle_inference_c_shared SHARED diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt index e35e14a0c0241..56de57cbb9c85 100644 --- a/paddle/fluid/inference/capi_exp/CMakeLists.txt +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -20,10 +20,6 @@ cc_library( SRCS ${C_API_SRCS} DEPS paddle_inference) -if(NOT ON_INFER) - return() -endif() - # Create inference capi shared library cc_library( paddle_inference_c_shared SHARED diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 8261ce288cb97..610883ad1ad27 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,13 +1,9 @@ -if(NOT APPLE AND NOT WIN32) - set(INFERENCE_EXTRA_DEPS paddle_inference_shared) -else() - set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_inference_io - ir_pass_manager analysis_predictor benchmark) +# If CI_SKIP_CPP_TEST=ON, there is no need to build and run these test. 
+if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + return() endif() -if(WITH_GPU AND TENSORRT_FOUND) - set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps}) -endif() +set(INFERENCE_EXTRA_DEPS paddle_inference_shared) function(download_data install_dir data_file check_sum) string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) @@ -948,18 +944,26 @@ if(WITH_GPU AND TENSORRT_FOUND) analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c) + endif() inference_analysis_test( test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c) + endif() set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") @@ -1107,9 +1111,13 @@ inference_analysis_test( analyzer_capi_exp_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - paddle_inference_c ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp paddle_inference_c) +endif() inference_analysis_test( test_analyzer_capi_exp_pd_config @@ -1117,9 +1125,14 @@ inference_analysis_test( analyzer_capi_exp_pd_config_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp_pd_config + paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp_pd_config paddle_inference_c) +endif() inference_analysis_test( test_analyzer_capi_exp_pd_tensor @@ -1127,9 +1140,14 @@ inference_analysis_test( analyzer_capi_exp_pd_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp_pd_tensor + paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp_pd_tensor paddle_inference_c) +endif() if(NOT APPLE AND NOT WIN32) inference_analysis_test( @@ -1138,10 +1156,16 @@ if(NOT APPLE AND NOT WIN32) analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_pd_threads + paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_pd_threads paddle_inference_c) + endif() endif() + inference_analysis_test( test_analyzer_zerocopytensor_tensor SRCS @@ -1182,9 +1206,13 @@ if(WITH_MKLDNN) analyzer_capi_exp_int_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - paddle_inference_c ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_int paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_int paddle_inference_c) + endif() endif() inference_analysis_test( @@ -1193,9 +1221,13 @@ inference_analysis_test( analyzer_capi_exp_ner_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - paddle_inference_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) 
+if(WIN32) + target_link_libraries(test_analyzer_capi_exp_ner paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp_ner paddle_inference_c) +endif() if(WITH_GPU) inference_analysis_test( @@ -1224,10 +1256,6 @@ cc_test( SRCS paddle_infer_api_errors_tester.cc DEPS paddle_inference_api) -if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") - return() -endif() - if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 300) set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 300) diff --git a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc index 0df36592cc39e..dc8921ef7311e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc @@ -66,11 +66,6 @@ void profile(bool use_mkldnn = false) { FLAGS_num_threads); } -TEST(Analyzer_resnet50, profile) { profile(); } -#ifdef PADDLE_WITH_MKLDNN -TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); } -#endif - // Check the fuse status TEST(Analyzer_resnet50, fuse_statis) { AnalysisConfig cfg; @@ -82,6 +77,11 @@ TEST(Analyzer_resnet50, fuse_statis) { LOG(INFO) << "num_ops: " << num_ops; } +TEST(Analyzer_resnet50, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + // Compare result of NativeConfig and AnalysisConfig void compare(bool use_mkldnn = false) { AnalysisConfig cfg; diff --git a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc index 93d4a88383c33..70c1eb8bab253 100644 --- a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc +++ b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc @@ -23,6 +23,11 @@ namespace inference { TEST(TensorRT_fc, compare) { std::string model_dir = FLAGS_infer_model + "/fc_uint8"; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); + config.DisableGlogInfo(); + auto predictor = CreatePaddlePredictor(config); compare(model_dir, /* use_tensorrt */ true); // Open it when need. // profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc index 3b25c32fc7514..45c14f4fc8b37 100644 --- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc +++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc @@ -23,6 +23,11 @@ namespace inference { TEST(TensorRT_mobilenet, compare) { std::string model_dir = FLAGS_infer_model + "/mobilenet"; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); + config.DisableGlogInfo(); + auto predictor = CreatePaddlePredictor(config); compare(model_dir, /* use_tensorrt */ true); // Open it when need. 
// profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); diff --git a/paddle/fluid/inference/tests/api/trt_resnext_test.cc b/paddle/fluid/inference/tests/api/trt_resnext_test.cc index 374074957c870..8d4e331fa9730 100644 --- a/paddle/fluid/inference/tests/api/trt_resnext_test.cc +++ b/paddle/fluid/inference/tests/api/trt_resnext_test.cc @@ -23,6 +23,11 @@ namespace inference { TEST(TensorRT_resnext50, compare) { std::string model_dir = FLAGS_infer_model + "/resnext50"; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); + config.DisableGlogInfo(); + auto predictor = CreatePaddlePredictor(config); compare(model_dir, /* use_tensorrt */ true); } diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index cbc1faf94f07c..ead3e463c2803 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -39,10 +39,9 @@ enum class AllocationType : int8_t { const char* AllocationTypeStr(AllocationType type); -PADDLE_API size_t -GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); +size_t GetOrRegisterGlobalDeviceTypeId(const std::string& device_type); -PADDLE_API std::string GetGlobalDeviceType(size_t device_type_id_); +std::string GetGlobalDeviceType(size_t device_type_id_); /// \brief The place is used to specify where the data is stored. class PADDLE_API Place { diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index d87915d172bb7..9680ec234b3b4 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -685,7 +685,8 @@ set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\inst %THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^ %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;^ %THIRD_PARTY_PATH:/=\%\install\onnxruntime\lib;%THIRD_PARTY_PATH:/=\%\install\paddle2onnx\lib;^ -%work_dir%\%BUILD_DIR%\paddle\fluid\inference;%PATH% +%work_dir%\%BUILD_DIR%\paddle\fluid\inference;%work_dir%\%BUILD_DIR%\paddle\fluid\inference\capi_exp;^ +%PATH% REM TODO: make ut find .dll in install\onnxruntime\lib xcopy %THIRD_PARTY_PATH:/=\%\install\onnxruntime\lib\onnxruntime.dll %work_dir%\%BUILD_DIR%\paddle\fluid\inference\tests\api\ /Y From eb8fc7595ff04213021bfb322d8ecf987681cf41 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 1 Jul 2022 18:02:30 +0800 Subject: [PATCH 033/250] [DCU] fix compiling error on DTK22.04, test=develop (#43999) --- paddle/fluid/operators/class_center_sample_op.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index 236f783017dcc..a0642694843e8 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -220,6 +220,12 @@ class NotEqualToPreviousAdjacentIterator { return ret; } + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const { + self_type ret(arr_, offset_ - n); + return ret; + } + template __host__ __device__ __forceinline__ reference operator[](Distance n) const { return *(*this + n); From b4fef3974a2e0436a8423bc11a2558a76c104150 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Fri, 1 Jul 2022 19:06:16 +0800 Subject: [PATCH 034/250] template GetWeightCPUData (#44002) --- paddle/fluid/operators/tensorrt/tensorrt_engine_op.h | 7 ------- 1 file changed, 7 deletions(-) diff --git 
a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index b1aa2b2c49ef6..b0ac285b5d38d 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -467,13 +467,6 @@ class TensorRTEngineOp : public framework::OperatorBase { auto stream = reinterpret_cast(dev_ctx).stream(); - PADDLE_ENFORCE_EQ( - runtime_input_names_.empty(), - false, - platform::errors::PreconditionNotMet( - "TensorRT engine needs at least one input, but no input is found. " - "Please check if you set the input correctly.")); - std::vector output_maps = Attr>("output_name_mapping"); From f1ffd59a8686cda32dae65aff259d489fff83da7 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 1 Jul 2022 19:35:28 +0800 Subject: [PATCH 035/250] add clip_extra and change use_combine_name (#44008) --- python/paddle/fluid/dygraph/jit.py | 9 ++++++--- .../paddle/fluid/tests/unittests/test_jit_save_load.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 393f1c1570453..e8c263fe03355 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -379,7 +379,9 @@ def keep_name_table(self, value): def _parse_save_configs(configs): - supported_configs = ['output_spec', "with_hook", "use_combine"] + supported_configs = [ + 'output_spec', "with_hook", "combine_params", "clip_extra" + ] # input check for key in configs: @@ -392,7 +394,8 @@ def _parse_save_configs(configs): inner_config = _SaveLoadConfig() inner_config.output_spec = configs.get('output_spec', None) inner_config.with_hook = configs.get('with_hook', False) - inner_config.combine_params = configs.get("use_combine", False) + inner_config.combine_params = configs.get("combine_params", False) + inner_config.clip_extra = configs.get("clip_extra", False) return inner_config @@ -1015,7 +1018,7 @@ def fun(inputs): params_filename=params_filename, export_for_deployment=configs._export_for_deployment, program_only=configs._program_only, - clip_extra=False) + clip_extra=configs.clip_extra) # collect all vars for var in concrete_program.main_program.list_vars(): diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index f467fbe4888e6..fd4129f47ff65 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -1209,7 +1209,7 @@ def test_save_load_finetune_load(self): with unique_name.guard(): net = Net() #save - paddle.jit.save(net, model_path, use_combine=True) + paddle.jit.save(net, model_path, combine_params=True) class LayerLoadFinetune(paddle.nn.Layer): From 8d9f00a893d8f2bbc08f04c6789fdde0abb6af1c Mon Sep 17 00:00:00 2001 From: pangyoki Date: Fri, 1 Jul 2022 21:20:14 +0800 Subject: [PATCH 036/250] convert graph to program to let SandaloneExecutor supporrt CompiledProgram (#43448) * convert graph to program to let sSandaloneExecutor supporrt CompiledProgram * skip case that compiled_program._program is None * execute CompiledProgram._compile to apply build_strategy --- python/paddle/fluid/executor.py | 21 +++++++++++++++++++ .../interpreter/test_standalone_executor.py | 11 ++++++++++ 2 files changed, 32 insertions(+) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index d932b3f219bc2..78c3f413966e9 100755 --- a/python/paddle/fluid/executor.py +++ 
b/python/paddle/fluid/executor.py @@ -411,6 +411,18 @@ def _is_enable_standalone_executor(): return flag +def _is_standalone_executor_enable_compiled_program(): + """ + Whether to use experimental executor `StandaloneExecutor` in CompiledProgram. + Convert Graph to Program. + """ + flag = False + env_val = os.environ.get('FLAGS_CONVERT_GRAPH_TO_PROGRAM', None) + if env_val in [1, '1', True, 'True', 'true']: + flag = True + return flag + + def _prepare_fleet_executor(): from ..distributed.fleet.proto import fleet_executor_desc_pb2 trainer_endpoints_str = os.getenv("PADDLE_TRAINER_ENDPOINTS", "") @@ -1402,6 +1414,9 @@ def _can_use_interpreter_core(program, place): # print("compiled is : {}".format(compiled)) # NOTE(zhiqiu): do not support compiled program now if compiled: + if program._program is not None and _is_standalone_executor_enable_compiled_program( + ): + return True return False # if program._is_data_parallel and len( # program._get_places(place, program._places)) == 1: @@ -1438,6 +1453,12 @@ def _can_use_interpreter_core(program, place): # a little bit tricy here, use inner_program before _add_feed_fetch_ops to get key # while use program to geet _StandaloneExecutor if key not in self._executor_cache._cached_executors: + if isinstance(program, compiler.CompiledProgram): + program._compile(scope, self.place) + compiled_graph = program._graph + ir_graph = framework.IrGraph(compiled_graph, + for_test=True) + inner_program = ir_graph.to_program() program = self._add_feed_fetch_ops( program=inner_program, feed=feed, diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index 6fa419ae28228..f1b1bc118eb30 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -382,6 +382,17 @@ def test_compiled_program(self): for x, y in zip(gt, res): self.assertTrue(np.array_equal(x, y)) + def test_compiled_program_convert_graph_to_program(self): + data = np.ones([2, 2], dtype="float32") + feed = {"a": data} + + os.environ['FLAGS_CONVERT_GRAPH_TO_PROGRAM'] = '1' + res = self.run_new_executor(feed, use_compiled=True) + del os.environ['FLAGS_CONVERT_GRAPH_TO_PROGRAM'] + gt = self.run_raw_executor(feed, use_compiled=True) + for x, y in zip(gt, res): + self.assertTrue(np.array_equal(x, y)) + def test_empty_program(self): program = paddle.static.Program() exe = paddle.static.Executor(self.place) From 09096aebd49b1c07b8c0fea29206413f1ca938cc Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 1 Jul 2022 21:25:21 -0500 Subject: [PATCH 037/250] unify cpu context (#43989) * unify cpu context * fix init() * delete test_device_context * fix test_scalar --- .../elementwise/elementwise_add_op.cc | 6 - .../elementwise/elementwise_floordiv_op.cc | 3 - .../elementwise/elementwise_max_op.cc | 3 - .../elementwise/elementwise_min_op.cc | 3 - .../elementwise/elementwise_mod_op.cc | 3 - .../elementwise/elementwise_pow_op.cc | 3 - .../elementwise/elementwise_sub_op.cc | 3 - paddle/fluid/operators/increment_op.cc | 3 - paddle/fluid/operators/isfinite_op.cc | 3 - paddle/fluid/operators/isfinite_v2_op.cc | 3 - paddle/fluid/operators/label_smooth_op.cc | 3 - paddle/fluid/operators/math/beam_search.cc | 21 +- .../fluid/operators/math/concat_and_split.cc | 8 +- .../fluid/operators/math/context_project.cc | 11 +- .../fluid/operators/math/cos_sim_functor.cc | 7 - 
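
The executor change guards the new path behind an environment variable, as the added test_compiled_program_convert_graph_to_program case shows. A minimal usage sketch under the same assumptions as that test (feed name 'a', shape [2, 2]); it is illustrative, not part of the patch:

import os
import numpy as np
import paddle

# Opt in to running CompiledProgram through the standalone executor,
# exactly as the new unit test does.
os.environ['FLAGS_CONVERT_GRAPH_TO_PROGRAM'] = '1'

paddle.enable_static()
main = paddle.static.Program()
with paddle.static.program_guard(main):
    a = paddle.static.data(name='a', shape=[2, 2], dtype='float32')
    out = paddle.mean(a + 1)

exe = paddle.static.Executor(paddle.CPUPlace())
compiled = paddle.static.CompiledProgram(main)
res = exe.run(compiled, feed={'a': np.ones([2, 2], dtype='float32')}, fetch_list=[out])
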
paddle/fluid/operators/math/cross_entropy.cc | 9 - paddle/fluid/operators/math/gru_compute.cc | 6 - paddle/fluid/operators/math/im2col.cc | 30 - paddle/fluid/operators/math/math_function.cc | 335 ---------- paddle/fluid/operators/math/maxouting.cc | 5 - paddle/fluid/operators/math/sample_prob.cc | 13 +- .../operators/math/selected_rows_functor.cc | 86 +-- .../fluid/operators/math/sequence_padding.cc | 113 ---- paddle/fluid/operators/math/sequence_scale.cc | 26 - paddle/fluid/operators/math/softmax.cc | 7 - paddle/fluid/operators/math/vol2col.cc | 250 ------- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 1 - paddle/fluid/operators/rank_loss_op.cc | 3 - .../operators/reduce_ops/frobenius_norm_op.cc | 3 - .../operators/reduce_ops/reduce_all_op.cc | 3 - .../operators/reduce_ops/reduce_any_op.cc | 3 - .../operators/reduce_ops/reduce_prod_op.cc | 3 - .../operators/reduce_ops/reduce_sum_op.cc | 3 - paddle/fluid/operators/set_value_op.cc | 3 - paddle/fluid/platform/device_context.cc | 8 - paddle/fluid/platform/device_context.h | 9 +- paddle/fluid/platform/transform.h | 24 - paddle/infrt/kernel/phi/context_kernels.cc | 1 - .../infershaped/infershape_launchers_test.cc | 1 - paddle/phi/backends/cpu/cpu_context.cc | 10 +- paddle/phi/backends/cpu/cpu_context.h | 6 - paddle/phi/kernels/funcs/blas/blas_impl.h | 616 ------------------ paddle/phi/kernels/funcs/fc_functor.cc | 2 - paddle/phi/kernels/funcs/for_range.h | 16 - paddle/phi/kernels/funcs/gru_compute.cc | 185 ------ paddle/phi/kernels/funcs/lstm_compute.cc | 80 --- paddle/phi/kernels/funcs/math_function.cc | 134 ++-- paddle/phi/kernels/funcs/matrix_inverse.cc | 4 - paddle/phi/tests/api/test_sparse_utils_api.cc | 1 - paddle/phi/tests/common/test_scalar.cu | 7 - paddle/phi/tests/core/CMakeLists.txt | 4 - paddle/phi/tests/core/test_device_context.cc | 54 -- paddle/phi/tests/kernels/test_cast_dev_api.cc | 1 - .../phi/tests/kernels/test_concat_dev_api.cc | 1 - paddle/phi/tests/kernels/test_conj_dev_api.cc | 1 - paddle/phi/tests/kernels/test_copy_dev_api.cc | 1 - .../tests/kernels/test_creation_dev_api.cc | 4 - paddle/phi/tests/kernels/test_dot_dev_api.cc | 1 - .../tests/kernels/test_elementwise_dev_api.cc | 4 - .../phi/tests/kernels/test_flatten_dev_api.cc | 1 - .../phi/tests/kernels/test_math_function.cc | 1 - .../phi/tests/kernels/test_matmul_dev_api.cc | 1 - paddle/phi/tests/kernels/test_mean_dev_api.cc | 1 - .../phi/tests/kernels/test_reshape_dev_api.cc | 1 - .../phi/tests/kernels/test_scale_dev_api.cc | 2 - .../kernels/test_sparse_activation_dev_api.cc | 1 - .../kernels/test_sparse_conv3d_dev_api.cc | 1 - .../test_sparse_elementwise_dev_api.cc | 4 - .../tests/kernels/test_sparse_pool_dev_api.cc | 1 - .../kernels/test_sparse_utils_dev_api.cc | 6 - .../phi/tests/kernels/test_split_dev_api.cc | 1 - paddle/phi/tests/kernels/test_sum_dev_api.cc | 1 - 72 files changed, 61 insertions(+), 2121 deletions(-) delete mode 100644 paddle/fluid/operators/math/math_function.cc delete mode 100644 paddle/phi/tests/core/test_device_context.cc diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index c71f6b7c3cd19..0123df0006f15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -20,12 +20,6 @@ namespace paddle { namespace framework { class OpDesc; } // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform 
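
The deletions in this commit follow a single pattern: drop the platform::CPUDeviceContext forward declarations and keep only the phi::CPUContext instantiation of each CPU functor. A compile-only sketch of that pattern; DemoFunctor is a hypothetical name, not a Paddle symbol:

#include <cstddef>

namespace phi {
class CPUContext;  // the single CPU context type that remains after this commit
}

template <typename DeviceContext, typename T>
struct DemoFunctor {
  void operator()(const DeviceContext& ctx, const T* in, T* out, std::size_t n) {
    (void)ctx;  // context unused in this toy copy loop
    for (std::size_t i = 0; i < n; ++i) out[i] = in[i];
  }
};

// Before: CPU files typically carried a duplicate pair of instantiations, e.g.
//   template struct DemoFunctor<paddle::platform::CPUDeviceContext, float>;
//   template struct DemoFunctor<phi::CPUContext, float>;
// After: only the phi::CPUContext instantiation is kept.
template struct DemoFunctor<phi::CPUContext, float>;
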
} // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc index 422cbd881d28a..6a8c986a53c24 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -25,9 +25,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index 58e9c6d7b4cb8..1911b5c2de6d7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -23,9 +23,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 8b967cb1fe15e..9fd70754888bd 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -23,9 +23,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc index ee67f7e4020f1..55d6e214d6c12 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -25,9 +25,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc index c13fba99bdbab..fcfee9b4fca15 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc @@ -20,9 +20,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index a9968906fb90a..24f0228025f7f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -23,9 +23,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 7d62bf2d628d0..3ab6b9f9405ed 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -25,9 +25,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git 
a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index a7fc4865f78cb..bcab28df3a155 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 314bbf556aed6..65857b6d87db1 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -34,9 +34,6 @@ namespace operators { template class OverflowKernel; } // namespace operators -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index ccd4db816bdce..873ab62a3d246 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -24,9 +24,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index cda085aabe99b..2b607ade728c4 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -13,26 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/beam_search.h" - +#include "paddle/phi/backends/cpu/cpu_context.h" namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { template -class BeamSearchFunctor { +class BeamSearchFunctor { public: - void operator()(const platform::CPUDeviceContext &context, + void operator()(const phi::CPUContext &context, const framework::LoDTensor *pre_ids, const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids, @@ -308,10 +301,10 @@ class BeamSearchFunctor { } }; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 4ce2db1e579db..3df69e200190f 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #ifdef PADDLE_WITH_ASCEND_CL @@ -28,13 +29,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc index 927d610e2ce47..beee93ae0166c 100644 --- a/paddle/fluid/operators/math/context_project.cc +++ b/paddle/fluid/operators/math/context_project.cc @@ -13,19 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/context_project.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle +#include "paddle/phi/backends/cpu/cpu_context.h" namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc index 4a3da2ef86d37..85f012afb505a 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cc +++ b/paddle/fluid/operators/math/cos_sim_functor.cc @@ -14,16 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { - template struct CosSimDyFunctor { void operator()(const platform::CPUDeviceContext& ctx, diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index 45c7e47b810ac..17ff6aff6f93d 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -17,12 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -129,9 +123,6 @@ void CrossEntropyFunctor::operator()( } } -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; - template class CrossEntropyFunctor; template class CrossEntropyFunctor; } // namespace math diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index d8fa1b5a869b1..7e543a63afc9e 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -15,12 +15,6 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index e7ed2cbf67563..9192badedcfff 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -16,12 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace phi { class CPUContext; } // namespace phi @@ -166,24 +160,12 @@ class Col2ImFunctor; -template class Im2ColFunctor; template class Im2ColFunctor; template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; template class Col2ImFunctor; @@ -353,24 +335,12 @@ class Col2ImFunctor; -template class Im2ColFunctor; template class Im2ColFunctor; template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; template class Col2ImFunctor; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc deleted file mode 100644 index 5eff0a5d4575b..0000000000000 --- a/paddle/fluid/operators/math/math_function.cc +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/math_function.h" - -#ifdef PADDLE_WITH_MKLML -#include "paddle/fluid/platform/dynload/mklml.h" -#endif - -#ifdef PADDLE_USE_OPENBLAS -#include -#endif - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/math/math_function_impl.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace paddle { -namespace operators { -namespace math { - -using float16 = paddle::platform::float16; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -#ifdef PADDLE_WITH_XPU -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; -#endif - -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose, \ - RANK>; - -DEFINE_CPU_TRANS(1); -DEFINE_CPU_TRANS(2); -DEFINE_CPU_TRANS(3); -DEFINE_CPU_TRANS(4); -DEFINE_CPU_TRANS(5); -DEFINE_CPU_TRANS(6); - -template -struct TransposeNormal { - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& in, - framework::Tensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = phi::stride(in.dims()); - auto out_stride = phi::stride(out->dims()); - const T* in_ptr = in.data(); - T* out_ptr = out->data(); - - auto transpose_helper = [&](int64_t beg, int64_t end) { - for (int64_t out_idx = beg; out_idx < end; ++out_idx) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - // calculate the input index - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride[i]; - tmp_idx -= coordinate * out_stride[i]; - in_idx += coordinate * in_stride[axis[i]]; - } - out_ptr[out_idx] = in_ptr[in_idx]; - } - }; - transpose_helper(0, out->numel()); - } -}; - -// define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal - -DEFINE_CPU_TRANS_NORMAL(platform::float16); -DEFINE_CPU_TRANS_NORMAL(platform::bfloat16); -DEFINE_CPU_TRANS_NORMAL(float); -DEFINE_CPU_TRANS_NORMAL(double); -DEFINE_CPU_TRANS_NORMAL(int); -DEFINE_CPU_TRANS_NORMAL(int64_t); -DEFINE_CPU_TRANS_NORMAL(bool); -DEFINE_CPU_TRANS_NORMAL(int16_t); -DEFINE_CPU_TRANS_NORMAL(uint8_t); 
-DEFINE_CPU_TRANS_NORMAL(int8_t); -DEFINE_CPU_TRANS_NORMAL(platform::complex); -DEFINE_CPU_TRANS_NORMAL(platform::complex); - -struct TensorSetConstantCPU { - TensorSetConstantCPU(framework::Tensor* tensor, float value) - : tensor_(tensor), value_(value) {} - template - void apply() const { - auto cpu = platform::CPUPlace(); - auto* begin = tensor_->mutable_data(cpu); - std::fill(begin, begin + tensor_->numel(), static_cast(value_)); - } - framework::Tensor* tensor_; - float value_; -}; - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW( - platform::errors::Unimplemented("NPUPinnedPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); -} - -struct TensorSetConstantWithPlace : public boost::static_visitor { - TensorSetConstantWithPlace(const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) - : context_(context), tensor_(tensor), value_(value) {} - - template - void operator()(Place place) const { - set_constant_with_place(context_, tensor_, value_); - } - - const platform::DeviceContext& context_; - framework::Tensor* tensor_; - float value_; -}; - -void set_constant(const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - TensorSetConstantWithPlace func(context, tensor, value); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // tensor->place().apply_visitor(func); - paddle::platform::VisitPlace(tensor->place(), func); -#else - func(platform::CPUPlace()); -#endif -} - -template -struct RowwiseAdd { - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, - framework::Tensor* output) { - auto in_dims = input.dims(); - auto out_dims = output->dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ( - vector.numel(), - size, - platform::errors::InvalidArgument( - "The input vector size" - " should be equal to the size of each row of input tensor." 
- " Expected vector size=%d, but received %d", - size, - vector.numel())); - const char* in_dims_cstr = in_dims.to_str().c_str(); - const char* out_dims_cstr = out_dims.to_str().c_str(); - PADDLE_ENFORCE_EQ(out_dims, - in_dims, - platform::errors::InvalidArgument( - "The output tensor shape should be same as the input" - " tensor shape. Expected output tensor shape: %s," - " but received %s", - in_dims_cstr, - out_dims_cstr)); - - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(vector); - auto out = framework::EigenMatrix::From(*output); - - for (int64_t i = 0; i < in_dims[0]; ++i) { - out.chip(i, 0) = in.chip(i, 0) + vec; - } - } -}; - -template struct RowwiseAdd; -template struct RowwiseAdd; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct RowwiseSum; -template struct RowwiseSum; - -template struct RowwiseMean; -template struct RowwiseMean; - -template -struct ElementwiseAddTo { - void operator()(platform::CPUDeviceContext* ctx, - const framework::Tensor& src, - framework::Tensor* dst) { - auto in = framework::EigenVector::Flatten(src); - auto out = framework::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } -}; - -template struct ElementwiseAddTo; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 7729b86cc3e0b..2205ed51e1913 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -109,11 +109,6 @@ void MaxOutGradFunctor::operator()( } } -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; - template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; diff --git a/paddle/fluid/operators/math/sample_prob.cc b/paddle/fluid/operators/math/sample_prob.cc index 16342493e4597..18321cf9b9ece 100644 --- a/paddle/fluid/operators/math/sample_prob.cc +++ b/paddle/fluid/operators/math/sample_prob.cc @@ -14,19 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sample_prob.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { -namespace math { - -template class SampleWithProb; -template class SampleWithProb; - -} // namespace math +namespace math {} // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 81b0e9102bbac..399a1b6dc4ccd 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -276,51 +276,6 @@ struct SelectedRowsSumTo { template struct SelectedRowsSumTo; template struct SelectedRowsSumTo; -template -struct SelectedRowsAddToTensor { - void operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input1, - framework::Tensor* input2) { - if (UNLIKELY(input1.rows().size() == 0)) { - LOG(WARNING) << "input selected rows is empty!"; - return; - } - auto in1_height = input1.height(); - const auto& in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ( - in1_height, - in2_dims[0], - platform::errors::InvalidArgument("The two inputs height must be equal." 
- "But received first input height = " - "[%d], second input height = [%d]", - in1_height, - in2_dims[0])); - - auto& in1_value = input1.value(); - auto& in1_rows = input1.rows(); - - int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ( - in1_row_numel, - input2->numel() / in1_height, - platform::errors::InvalidArgument( - "The two inputs width must be equal." - "But received first input width = [%d], second input width = [%d]", - in1_row_numel, - input2->numel() / in1_height)); - - auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); - - for (size_t i = 0; i < in1_rows.size(); i++) { - for (int64_t j = 0; j < in1_row_numel; j++) { - input2_data[in1_rows[i] * in1_row_numel + j] += - in1_data[i * in1_row_numel + j]; - } - } - } -}; - template struct SelectedRowsAddToTensor { void operator()(const phi::CPUContext& context, @@ -366,13 +321,6 @@ struct SelectedRowsAddToTensor { } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; - template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; @@ -582,34 +530,6 @@ struct MergeAddImpl { } }; -template -struct MergeAdd { - // unary functor, merge by adding duplicated rows in - // the input SelectedRows object. - phi::SelectedRows operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input, - const bool sorted_result) { - return MergeAddImpl()( - context, input, sorted_result); - } - - void operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input, - phi::SelectedRows* output, - const bool sorted_result) { - MergeAddImpl()( - context, input, output, sorted_result); - } - - void operator()(const platform::CPUDeviceContext& context, - const std::vector& inputs, - phi::SelectedRows* output, - const bool sorted_result) { - MergeAddImpl()( - context, inputs, output, sorted_result); - } -}; - template struct MergeAdd { // unary functor, merge by adding duplicated rows in @@ -635,10 +555,8 @@ struct MergeAdd { } }; -#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \ - template struct MergeAddImpl; \ - template struct MergeAddImpl; \ - template struct MergeAdd; \ +#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \ + template struct MergeAddImpl; \ template struct MergeAdd; TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(float) diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 1a952bbb62d52..826eda5559a46 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -20,13 +20,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -101,66 +94,6 @@ static void fast_mem_init(void* dest, } } -template -class PaddingLoDTensorFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& seq_tensor, - framework::LoDTensor* pad_tensor, - const framework::LoDTensor& pad_value, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_lod = seq_tensor.lod(); - const auto 
seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; - const auto& seq_tensor_dims = seq_tensor.dims(); - const auto& pad_tensor_dims = pad_tensor->dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor.numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - - PADDLE_ENFORCE_EQ( - pad_value.numel() == 1 || pad_value.numel() == step_width, - true, - platform::errors::InvalidArgument( - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width', but got %ld != 1 and %ld. Please check the input " - "value.", - pad_value.numel(), - step_width)); - - // fill padding value - T* pad_data = pad_tensor->data(); - const T* pad_value_data = pad_value.data(); - if (pad_value.numel() == 1) { - fast_mem_init( - pad_data, pad_tensor->numel(), pad_value_data, sizeof(T)); - } else { - for (int i = 0; i < pad_tensor->numel(); i += step_width) { - memcpy(pad_data + i, pad_value_data, step_width * sizeof(T)); - } - } - - CopyValidData(pad_tensor, - &seq_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kSeqToPad, - layout); - } -}; - template class PaddingLoDTensorFunctor { public: @@ -221,42 +154,6 @@ class PaddingLoDTensorFunctor { } }; -template -class UnpaddingLoDTensorFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& pad_tensor, - framework::LoDTensor* seq_tensor, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; - const auto& seq_tensor_dims = seq_tensor->dims(); - const auto& pad_tensor_dims = pad_tensor.dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor->numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - - CopyValidData(seq_tensor, - &pad_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kPadToSeq, - layout); - } -}; - template class UnpaddingLoDTensorFunctor { public: @@ -293,16 +190,6 @@ class UnpaddingLoDTensorFunctor { } }; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; - -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; - template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index cd91b2eb53149..8faf9572bef0d 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -24,29 +24,6 @@ namespace paddle { namespace operators { namespace math { -template -class ScaleLoDTensorFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const T* scales, - framework::LoDTensor* seq) { - const size_t level = 0; - auto lod = seq->lod(); - const size_t num_seq = lod[level].size() - 1; - size_t seq_width = seq->dims()[1]; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - T* seq_data = seq->mutable_data(context.GetPlace()); - for (size_t i = 0; i 
< num_seq; ++i) { - for (size_t j = lod[level][i] * seq_width; - j < lod[level][i + 1] * seq_width; - ++j) { - seq_data[j] *= scales[i]; - } - } - } -}; - template class ScaleLoDTensorFunctor { public: @@ -70,9 +47,6 @@ class ScaleLoDTensorFunctor { } }; -template class ScaleLoDTensorFunctor; -template class ScaleLoDTensorFunctor; - template class ScaleLoDTensorFunctor; template class ScaleLoDTensorFunctor; diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index adea86a6c5a87..730dcbf59a605 100644 --- a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -21,13 +21,6 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; - template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 7b687909306c0..36ce3e6474254 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -16,12 +16,6 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -32,126 +26,6 @@ namespace math { * [input_channels, filter_depth, filter_height, filter_width, * output_depth, output_height, output_width] */ -template -class Vol2ColFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& vol, - const std::vector& dilations, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* col, - const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol.dims().size(), - 4, - platform::errors::InvalidArgument( - "The dimension of vol should be 4, but received %d.", - vol.dims().size())); - - PADDLE_ENFORCE_EQ(col->dims().size(), - 7, - platform::errors::InvalidArgument( - "The dimension of col should be 7, but received %d.", - col->dims().size())); - - int input_channels = - (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); - int input_depth = - (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); - int input_height = - (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); - int input_width = - (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); - int filter_depth = col->dims()[1]; - int filter_height = col->dims()[2]; - int filter_width = col->dims()[3]; - int output_depth = col->dims()[4]; - int output_height = col->dims()[5]; - int output_width = col->dims()[6]; - int channels_col = - input_channels * filter_depth * filter_height * filter_width; - - // changed - bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; - int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; - int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; - int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; - int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; - int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1; - PADDLE_ENFORCE_EQ( - input_depth_tmp, - output_depth, - platform::errors::InvalidArgument( - "input_depth(%d) and output_depth(%d) are mismatching.", - input_depth_tmp, - output_depth)); - auto input_height_tmp = (input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1; - PADDLE_ENFORCE_EQ( - input_height_tmp, - output_height, - platform::errors::InvalidArgument( - "input_height(%d) and output_height(%d) are mismatching.", - input_height_tmp, - output_height)); - auto input_width_tmp = (input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1; - PADDLE_ENFORCE_EQ( - input_width_tmp, - output_width, - platform::errors::InvalidArgument( - "input_width(%d) and output_width(%d) are mismatching.", - input_width_tmp, - output_width)); - const T* vol_data = vol.data(); - T* col_data = col->data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int d_offset = (c / filter_width / filter_height) % filter_depth; - int c_in = c / filter_width / filter_height / filter_depth; - for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; - for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; - for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; - - int col_idx = - ((c * output_depth + d) * output_height + h) * output_width + w; - int vol_idx; - if (data_layout != DataLayout::kNHWC) { - vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * - input_width + - w_pad; - } else { - vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) * - input_channels + - c_in; - } - col_data[col_idx] = - (h_pad < 0 || h_pad >= input_height || w_pad < 0 || - w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) - ? static_cast(0) - : vol_data[vol_idx]; - } - } - } - } - } -}; - template class Vol2ColFunctor { public: @@ -278,126 +152,6 @@ class Vol2ColFunctor { * [input_channels, filter_depth, filter_height, filter_width, * output_depth, output_height, output_width] */ -template -class Col2VolFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& col, - const std::vector& dilations, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* vol, - const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol->dims().size(), - 4, - platform::errors::InvalidArgument( - "The dimension of vol should be 4, but received %d.", - vol->dims().size())); - - PADDLE_ENFORCE_EQ(col.dims().size(), - 7, - platform::errors::InvalidArgument( - "The dimension of col should be 7, but received %d.", - col.dims().size())); - - int input_channels = - (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); - int input_depth = - (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]); - int input_height = - (data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]); - int input_width = - (data_layout != DataLayout::kNHWC ? 
vol->dims()[3] : vol->dims()[2]); - int filter_depth = col.dims()[1]; - int filter_height = col.dims()[2]; - int filter_width = col.dims()[3]; - int output_depth = col.dims()[4]; - int output_height = col.dims()[5]; - int output_width = col.dims()[6]; - int channels_col = - input_channels * filter_depth * filter_height * filter_width; - - bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; - int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; - int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; - int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; - int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; - int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; - - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1; - PADDLE_ENFORCE_EQ( - input_depth_tmp, - output_depth, - platform::errors::InvalidArgument( - "input_depth(%d) and output_depth(%d) are mismatching.", - input_depth_tmp, - output_depth)); - auto input_height_tmp = (input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1; - PADDLE_ENFORCE_EQ( - input_height_tmp, - output_height, - platform::errors::InvalidArgument( - "input_height(%d) and output_height(%d) are mismatching.", - input_height_tmp, - output_height)); - auto input_width_tmp = (input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1; - PADDLE_ENFORCE_EQ( - input_width_tmp, - output_width, - platform::errors::InvalidArgument( - "input_width(%d) and output_width(%d) are mismatching.", - input_width_tmp, - output_width)); - T* vol_data = vol->data(); - const T* col_data = col.data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int d_offset = (c / filter_width / filter_height) % filter_depth; - int cIm = c / filter_width / filter_height / filter_depth; - for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; - for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; - for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; - - if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && - w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { - int vol_idx; - if (data_layout != DataLayout::kNHWC) { - vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * - input_width + - w_pad; - } else { - vol_idx = - ((d_pad * input_height + h_pad) * input_width + w_pad) * - input_channels + - cIm; - } - int col_idx = - ((c * output_depth + d) * output_height + h) * output_width + - w; - vol_data[vol_idx] += col_data[col_idx]; - } - } - } - } - } - } -}; - template class Col2VolFunctor { public: @@ -518,13 +272,9 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; template class Vol2ColFunctor; template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; template class Col2VolFunctor; template class Col2VolFunctor; diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index a92d9ec2f2b4b..bd6d55fb7b3fa 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ 
b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -34,7 +34,6 @@ class DenseTensor; namespace paddle { namespace framework {} // namespace framework namespace platform { -class CPUDeviceContext; class MKLDNNDeviceContext; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 05f2fb7067e51..49d6424394ab7 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -24,9 +24,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index c7b0e8ced59e7..7fba45fa53923 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -27,9 +27,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 6947ca5b71a93..f0de94666357e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -27,9 +27,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index 85e262add2e74..6634ccaaa0121 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 1c88c4cb70842..578954663c7f5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -25,9 +25,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index ca24cc9c634a3..d072dcfa5eb94 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -27,9 +27,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index da2cf4c0dbe14..074642e1b0241 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -31,9 +31,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace 
platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 3ad22def69039..ec7f46cd973d4 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -367,14 +367,6 @@ DeviceContextPool::DeviceContextPool( /*disable_setting_default_stream_for_allocator=*/false); } -CPUDeviceContext::CPUDeviceContext() : phi::CPUContext() { - phi::CPUContext::Init(); -} - -CPUDeviceContext::CPUDeviceContext(CPUPlace place) : phi::CPUContext(place) { - phi::CPUContext::Init(); -} - #ifdef PADDLE_WITH_IPU IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index c6cc29d9ca1c8..2c3bc017635dd 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -134,14 +134,7 @@ constexpr DeviceType kMLU = DeviceType::MLU; using DeviceContext = phi::DeviceContext; -// using CPUDeviceContext = phi::CPUContext; -// TODO(wilber): The place constructor is used in many places, it is more -// difficult to use CPUDeviceContext = phi::CPUContext directly. -class CPUDeviceContext : public phi::CPUContext { - public: - CPUDeviceContext(); - explicit CPUDeviceContext(CPUPlace place); -}; +using CPUDeviceContext = phi::CPUContext; template struct DefaultDeviceContextType; diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 45756372e2291..575415ef89023 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -69,30 +69,6 @@ struct Transform { }; // NOTE: After the phi kernel is migrated, it needs to be deleted. 
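// ----------------------------------------------------------------------------
// Illustrative sketch, not an applied hunk of this patch: the hunks in this
// section consolidate paddle::platform::CPUDeviceContext into phi::CPUContext
// (via `using CPUDeviceContext = phi::CPUContext;`) and move resource setup
// into the phi::CPUContext constructors, which is why the explicit
// `dev_ctx.Init()` calls are deleted from the call sites below. Assuming the
// patch applies as shown, a caller after this change might look like:
//
//   #include "paddle/phi/backends/cpu/cpu_context.h"
//
//   phi::CPUContext ctx;               // constructor now initializes resources
//   ctx.SetAllocator(alloc);           // `alloc` is a hypothetical Allocator*
//   Eigen::DefaultDevice* dev = ctx.eigen_device();  // usable without Init()
// ----------------------------------------------------------------------------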
-template <> -struct Transform { - template - void operator()(const platform::CPUDeviceContext& context, - InputIter first, - InputIter last, - OutputIter result, - UnaryOperation op) { - std::transform(first, last, result, op); - } - - template - void operator()(const platform::CPUDeviceContext& context, - InputIter1 first1, - InputIter1 last1, - InputIter2 first2, - OutputIter result, - BinaryOperation op) { - std::transform(first1, last1, first2, result, op); - } -}; template <> struct Transform { diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 23d96aeb8d5e5..9c5ab13d17b52 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -20,7 +20,6 @@ namespace phi { ::phi::CPUContext CreateCPUContext() { ::phi::CPUContext ctx{}; - ctx.Init(); auto allocator = new backends::CpuPhiAllocator{}; ctx.SetAllocator(allocator); ctx.SetHostAllocator(allocator); diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 5a314817c2420..aa577da60c3ae 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -81,7 +81,6 @@ TEST(ElementwiseAdd, launcher_registry) { ::phi::CPUContext context; context.SetAllocator(alloc); - context.Init(); host_context::KernelFrameBuilder kernel_frame_builder; kernel_frame_builder.AddArgument(new host_context::Value(std::move(context))); diff --git a/paddle/phi/backends/cpu/cpu_context.cc b/paddle/phi/backends/cpu/cpu_context.cc index 42e19944b210b..63b5d82f3bdd0 100644 --- a/paddle/phi/backends/cpu/cpu_context.cc +++ b/paddle/phi/backends/cpu/cpu_context.cc @@ -51,10 +51,14 @@ struct CPUContext::Impl { }; CPUContext::CPUContext() - : DeviceContext(), impl_(std::make_unique()) {} + : DeviceContext(), impl_(std::make_unique()) { + impl_->Init(); +} CPUContext::CPUContext(const Place& place) - : DeviceContext(), impl_(std::make_unique(place)) {} + : DeviceContext(), impl_(std::make_unique(place)) { + impl_->Init(); +} CPUContext::~CPUContext() = default; @@ -62,8 +66,6 @@ CPUContext::CPUContext(CPUContext&&) = default; CPUContext& CPUContext::operator=(CPUContext&&) = default; -void CPUContext::Init() { impl_->Init(); } - Eigen::DefaultDevice* CPUContext::eigen_device() const { return impl_->GetEigenDevice(); } diff --git a/paddle/phi/backends/cpu/cpu_context.h b/paddle/phi/backends/cpu/cpu_context.h index e482fdc9e042f..58548b2e04e02 100644 --- a/paddle/phi/backends/cpu/cpu_context.h +++ b/paddle/phi/backends/cpu/cpu_context.h @@ -34,12 +34,6 @@ class PADDLE_API CPUContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; const Place& GetPlace() const override; - public: - // NOTE: DeviceContext hold resources. Used in training scenarios. - // The interface used by the training scene, DeviceContext will initialize - // all resources and delete them when destructing. - void Init(); - protected: // NOTE: External users manage resources. Used in inference scenarios. 
// The Set interface is for inference only, DeviceContext will mark the diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index db4796b3f61ca..a18ec953d0abd 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1003,12 +1003,6 @@ struct CBlas { #ifdef PADDLE_WITH_MKLML template <> template -T *Blas::GEMM_ALLOC( - const CBLAS_IDENTIFIER id, const int M, const int N, const int K) const { - return CBlas::GEMM_ALLOC(id, M, N, K); -} -template <> -template T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N, @@ -1016,20 +1010,6 @@ T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, return CBlas::GEMM_ALLOC(id, M, N, K); } -template <> -template -void Blas::GEMM_PACK( - const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T *src, - const int ld, - T *dst) const { - CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); -} template <> template void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, @@ -1044,24 +1024,6 @@ void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); } -template <> -template -void Blas::GEMM_COMPUTE( - int transA, - int transB, - int M, - int N, - int K, - const T *A, - const int lda, - const T *B, - const int ldb, - T beta, - T *C, - const int ldc) const { - CBlas::GEMM_COMPUTE( - CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); -} template <> template void Blas::GEMM_COMPUTE(int transA, @@ -1080,11 +1042,6 @@ void Blas::GEMM_COMPUTE(int transA, CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); } -template <> -template -void Blas::GEMM_FREE(T *data) const { - CBlas::GEMM_FREE(data); -} template <> template void Blas::GEMM_FREE(T *data) const { @@ -1092,36 +1049,6 @@ void Blas::GEMM_FREE(T *data) const { } #endif -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1153,36 +1080,6 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, ldc); } -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA == false ? CblasNoTrans : CblasTrans, - transB == false ? 
CblasNoTrans : CblasTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} template <> template void Blas::GEMM(bool transA, @@ -1214,36 +1111,6 @@ void Blas::GEMM(bool transA, ldc); } -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1323,50 +1190,18 @@ void Blas::MatMul(const phi::DenseTensor &mat_a, mat_out->data()); } -template <> -template -void Blas::AXPY(int n, - T alpha, - const T *x, - T *y) const { - CBlas::AXPY(n, alpha, x, 1, y, 1); -} template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { CBlas::AXPY(n, alpha, x, 1, y, 1); } -template <> -template -void Blas::VCOPY(int n, - const T *x, - T *y) const { - CBlas::VCOPY(n, x, 1, y, 1); -} template <> template void Blas::VCOPY(int n, const T *x, T *y) const { CBlas::VCOPY(n, x, 1, y, 1); } -template <> -template -void Blas::VADD(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VADD(n, x, y, z); -#else - if (x == z) { - this->template AXPY(n, (T)(1.), y, z); - } else { - this->template VCOPY(n, y, z); - this->template AXPY(n, (T)(1.), x, z); - } -#endif -} template <> template void Blas::VADD(int n, const T *x, const T *y, T *z) const { @@ -1382,21 +1217,6 @@ void Blas::VADD(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VSUB(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSUB(n, x, y, z); -#else - // try to find if openblas support vsub - for (int i = 0; i < n; ++i) { - z[i] = x[i] - y[i]; - } -#endif -} template <> template void Blas::VSUB(int n, const T *x, const T *y, T *z) const { @@ -1410,21 +1230,6 @@ void Blas::VSUB(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VMUL(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMUL(n, x, y, z); -#else - // try to find if openblas support vmul - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -#endif -} template <> template void Blas::VMUL(int n, const T *x, const T *y, T *z) const { @@ -1438,21 +1243,6 @@ void Blas::VMUL(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VDIV(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VDIV(n, x, y, z); -#else - // try to find if openblas support vdiv - for (int i = 0; i < n; ++i) { - z[i] = x[i] / y[i]; - } -#endif -} template <> template void Blas::VDIV(int n, const T *x, const T *y, T *z) const { @@ -1466,20 +1256,6 @@ void Blas::VDIV(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VEXP(int n, - const T *x, - T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VEXP(n, x, y); -#else - // try to find if openblas support vexp - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -#endif -} template <> template void Blas::VEXP(int n, const T *x, T *y) const { @@ -1493,19 +1269,6 @@ void Blas::VEXP(int n, const T *x, T *y) const { #endif } -template <> -template -void Blas::VSQUARE(int n, - const T *x, - T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSQUARE(n, x, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = x[i] * x[i]; - } 
-#endif -} template <> template void Blas::VSQUARE(int n, const T *x, T *y) const { @@ -1518,20 +1281,6 @@ void Blas::VSQUARE(int n, const T *x, T *y) const { #endif } -template <> -template -void Blas::VPOW(int n, - const T *x, - T a, - T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VPOW(n, x, a, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::pow(x[i], a); - } -#endif -} template <> template void Blas::VPOW(int n, const T *x, T a, T *y) const { @@ -1544,22 +1293,6 @@ void Blas::VPOW(int n, const T *x, T a, T *y) const { #endif } -template <> -template -T Blas::DOT(int n, - const T *x, - const T *y) const { -#ifdef PADDLE_WITH_MKLML - return CBlas::DOT(n, x, 1, y, 1); -#else - // try to find if openblas support cblas_dot - T sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] * y[i]; - } - return sum; -#endif -} template <> template T Blas::DOT(int n, const T *x, const T *y) const { @@ -1575,20 +1308,6 @@ T Blas::DOT(int n, const T *x, const T *y) const { #endif } -template <> -template -void Blas::SCAL(int n, - const T a, - T *x) const { -#ifdef PADDLE_WITH_MKLML - CBlas::SCAL(n, a, x, 1); -#else - // try to find if openblas support cblas_scal - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -#endif -} template <> template void Blas::SCAL(int n, const T a, T *x) const { @@ -1602,20 +1321,6 @@ void Blas::SCAL(int n, const T a, T *x) const { #endif } -template <> -template -T Blas::ASUM(int n, T *x, int inc) const { - auto sum = static_cast(0.0); -#ifdef PADDLE_WITH_MKLML - sum = CBlas::ASUM(n, x, inc); -#else - // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum - for (int c = 0; c < n; ++c) { - sum += x[c]; - } -#endif - return sum; -} template <> template T Blas::ASUM(int n, T *x, int inc) const { @@ -1631,19 +1336,6 @@ T Blas::ASUM(int n, T *x, int inc) const { return sum; } -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); -} template <> template void Blas::GEMV(bool trans_a, @@ -1658,66 +1350,6 @@ void Blas::GEMV(bool trans_a, CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - PADDLE_ENFORCE_NOT_NULL( - A, phi::errors::InvalidArgument("Pointer A should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - B, phi::errors::InvalidArgument("Pointer B should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - C, phi::errors::InvalidArgument("Pointer C should not be null.")); -#ifdef PADDLE_WITH_MKLML - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? 
N : K; - int ldc = N; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA]; - b_array[k] = &B[k * strideB]; - c_array[k] = &C[k * M * N]; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - auto *Ak = &A[k * strideA]; - auto *Bk = &B[k * strideB]; - auto *Ck = &C[k * M * N]; - this->template GEMM(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck); - } -#endif -} template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1778,47 +1410,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif } -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { -#ifdef PADDLE_WITH_MKLML - const int lda = (std::max)((transA == CblasNoTrans) ? K : M, 1); - const int ldb = (std::max)((transB == CblasNoTrans) ? N : K, 1); - const int ldc = (std::max)(N, 1); - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - A, - &lda, - B, - &ldb, - &beta, - C, - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -#endif -} template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1864,113 +1455,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template -void Blas::BatchedGEMMWithHead( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int W1, - int H1, - int W2, - int H2, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t head_number, - bool split_b_vertical) const { - int lda = (transA == CblasNoTrans) ? W1 : H1; - int ldb = (transB == CblasNoTrans) ? W2 : H2; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - - if (split_b_vertical) { - int ldc = W2; - int sub_width = W2 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W2 / head_number) - : i * (W2 / head_number) * H2; - int sub_matC_offset = i * W2 / head_number; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &sub_width, - &H2, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - - } else { - PADDLE_ENFORCE_EQ( - W1, - H2, - phi::errors::InvalidArgument( - "The fisrt matrix width should be same as second matrix height," - "but received fisrt matrix width %d" - ", second matrix height %d", - W1, - H2)); - int ldc = W2 * head_number; - int sub_width = W1 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? 
i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W1 / head_number) * W2 - : i * (W1 / head_number); - int sub_matC_offset = i * W2; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * head_number * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &W2, - &sub_width, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - } -} -template <> -template void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int W1, @@ -2097,43 +1581,6 @@ void Blas::MatMul( N); } -template <> -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - - // Since the matrix is very small, - // so the unit of calculation is already very fast, - // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, - // use xsmm directly. - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - const T alpha = static_cast(1); - const T beta = static_cast(0); - CBlas::SMM_GEMM( - &transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N); - return; -#endif - - CBlas::GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} template <> template void Blas::MatMul( @@ -2425,20 +1872,6 @@ void Blas::VINV(int n, const T *a, T *y) const { #endif } -template <> -template -void Blas::VMERF(int n, - const T *a, - T *y, - int64_t mode) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMERF(n, a, y, mode); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::erf(a[i]); - } -#endif -} template <> template void Blas::VMERF(int n, const T *a, T *y, int64_t mode) const { @@ -2454,39 +1887,6 @@ void Blas::VMERF(int n, const T *a, T *y, int64_t mode) const { #ifdef PADDLE_WITH_MKLML template <> template -void Blas::CSRMM(const char *transa, - const int *m, - const int *n, - const int *k, - const T *alpha, - const char *matdescra, - const T *val, - const int *indx, - const int *pntrb, - const int *pntre, - const T *b, - const int *ldb, - const T *beta, - T *c, - const int *ldc) const { - CBlas::CSRMM(transa, - m, - n, - k, - alpha, - matdescra, - val, - indx, - pntrb, - pntre, - b, - ldb, - beta, - c, - ldc); -} -template <> -template void Blas::CSRMM(const char *transa, const int *m, const int *n, @@ -2520,22 +1920,6 @@ void Blas::CSRMM(const char *transa, } #endif -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - CBlas::TRSM( - CblasRowMajor, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); -} template <> template void Blas::TRSM(CBLAS_SIDE side, diff --git a/paddle/phi/kernels/funcs/fc_functor.cc b/paddle/phi/kernels/funcs/fc_functor.cc index 0fb38c971abf5..0434483be1326 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cc +++ b/paddle/phi/kernels/funcs/fc_functor.cc @@ -96,8 +96,6 @@ void FCFunctor::operator()(const DeviceContext& context, } } -template class FCFunctor; -template class FCFunctor; template class FCFunctor; 
template class FCFunctor; diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h index bf0888c301fe7..78066ce5b2f5f 100644 --- a/paddle/phi/kernels/funcs/for_range.h +++ b/paddle/phi/kernels/funcs/for_range.h @@ -41,22 +41,6 @@ struct ForRange { size_t limit_; }; -// NOTE: After the pten kernel is migrated, it needs to be deleted. -template <> -struct ForRange { - ForRange(const paddle::platform::CPUDeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(limit) {} - - template - void operator()(Function func) const { - phi::funcs::ForRange for_range(dev_ctx_, limit_); - for_range(func); - } - - const paddle::platform::CPUDeviceContext& dev_ctx_; - size_t limit_; -}; - #if defined(__NVCC__) || defined(__HIPCC__) template diff --git a/paddle/phi/kernels/funcs/gru_compute.cc b/paddle/phi/kernels/funcs/gru_compute.cc index 8cda2e9062ae1..c081a9ed97d1f 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cc +++ b/paddle/phi/kernels/funcs/gru_compute.cc @@ -179,60 +179,6 @@ struct GRUUnitGradFunctor { } }; -template -struct GRUUnitFunctorV2 { - static void compute(const paddle::platform::CPUDeviceContext &context, - GRUMetaValue value, - int frame_size, - int batch_size, - const phi::funcs::detail::ActivationType active_node, - const phi::funcs::detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) - auto blas = - phi::funcs::GetBlas(context); - if (value.prev_out_value) { - blas.GEMM(CblasNoTrans, - CblasTrans, - batch_size, - frame_size, - frame_size, - 1, - value.prev_out_value, - value.state_weight, - 0, - value.reset_output_value); - } - detail::forward_reset_output( - phi::funcs::detail::forward::gru_resetOutput(), - value, - frame_size, - batch_size, - active_gate, - false, - &context); - - T *cell_state_value = value.gate_value + 2 * frame_size; - T *reset_output_value = value.reset_output_value; - for (int b = 0; b < batch_size; ++b) { - blas.VADD( - frame_size, cell_state_value, reset_output_value, cell_state_value); - cell_state_value += frame_size * 3; - reset_output_value += frame_size; - } - - detail::forward_final_output( - phi::funcs::detail::forward::gru_finalOutput(), - value, - frame_size, - batch_size, - active_node, - true, - false, - &context); -#endif - } -}; - template struct GRUUnitFunctorV2 { static void compute(const CPUContext &context, @@ -286,131 +232,6 @@ struct GRUUnitFunctorV2 { } }; -template -struct GRUUnitGradFunctorV2 { - static void compute(const paddle::platform::CPUDeviceContext &context, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - const phi::funcs::detail::ActivationType active_node, - const phi::funcs::detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) - // calculate grad_update_gate, grad_frame_state, - // grad_reset_output, grad_reset_gate - detail::cpu_gru_backward(context, - phi::funcs::detail::backward::gru(), - value, - grad, - frame_size, - batch_size, - active_node, - active_gate); - auto blas = - phi::funcs::GetBlas(context); - if (grad.prev_out_grad && value.prev_out_value) { - // update prev_out_grad - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - grad.gate_grad, - frame_size * 3, - value.gate_weight, - frame_size, - 1, - grad.prev_out_grad, - frame_size); - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - grad.gate_grad + frame_size, - frame_size * 3, - value.gate_weight + frame_size * frame_size, - frame_size, - 1, - grad.prev_out_grad, - 
frame_size); - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - grad.reset_output_grad, - frame_size, - value.state_weight, - frame_size, - 1, - grad.prev_out_grad, - frame_size); - // update weight_hh_grad - if (grad.gate_weight_grad) { - // reset gate - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - grad.gate_grad, - frame_size * 3, - value.prev_out_value, - frame_size, - 1, - grad.gate_weight_grad, - frame_size); - // update gate - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - grad.gate_grad + frame_size, - frame_size * 3, - value.prev_out_value, - frame_size, - 1, - grad.gate_weight_grad + frame_size * frame_size, - frame_size); - // cell state - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - grad.reset_output_grad, - frame_size, - value.prev_out_value, - frame_size, - 1, - grad.state_weight_grad, - frame_size); - } - } - // update bias_hh_grad - T *gate_grad = grad.gate_grad; - T *bias_hh_grad = grad.bias_hh_grad; - T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size; - T *reset_output_grad = grad.reset_output_grad; - for (int b = 0; b < batch_size; ++b) { - blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad); - blas.VADD( - frame_size, state_bias_grad, reset_output_grad, state_bias_grad); - gate_grad += 3 * frame_size; - reset_output_grad += frame_size; - } -#endif - } -}; - template struct GRUUnitGradFunctorV2 { static void compute(const CPUContext &context, @@ -540,12 +361,6 @@ template struct GRUUnitFunctor; template struct GRUUnitGradFunctor; template struct GRUUnitGradFunctor; -template struct GRUUnitFunctorV2; -template struct GRUUnitFunctorV2; -template struct GRUUnitGradFunctorV2; -template struct GRUUnitGradFunctorV2; - template struct GRUUnitFunctorV2; template struct GRUUnitFunctorV2; template struct GRUUnitGradFunctorV2; diff --git a/paddle/phi/kernels/funcs/lstm_compute.cc b/paddle/phi/kernels/funcs/lstm_compute.cc index 45d0b2e40b4f3..e4b8a6961fd7e 100644 --- a/paddle/phi/kernels/funcs/lstm_compute.cc +++ b/paddle/phi/kernels/funcs/lstm_compute.cc @@ -21,38 +21,6 @@ limitations under the License. 
*/ namespace phi { namespace funcs { -template -struct LstmUnitFunctor { - static void compute(const paddle::platform::CPUDeviceContext& context, - LstmMetaValue value, - int frame_size, - int batch_size, - T cell_clip, - const phi::funcs::detail::ActivationType& gate_act, - const phi::funcs::detail::ActivationType& cell_act, - const phi::funcs::detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_forward(context, - phi::funcs::detail::forward::lstm(), - value, - frame_size, - cell_clip, - cand_act, - gate_act, - cell_act, - old_api_version); - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - } - } -}; - template struct LstmUnitFunctor { static void compute(const CPUContext& context, @@ -85,49 +53,6 @@ struct LstmUnitFunctor { } }; -template -struct LstmUnitGradFunctor { - static void compute(const paddle::platform::CPUDeviceContext& context, - LstmMetaValue value, - LstmMetaGrad grad, - int frame_size, - int batch_size, - T cell_clip, - const phi::funcs::detail::ActivationType& gate_act, - const phi::funcs::detail::ActivationType& cell_act, - const phi::funcs::detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_backward(context, - phi::funcs::detail::backward::lstm(), - value, - grad, - frame_size, - cell_clip, - cand_act, - gate_act, - cell_act, - old_api_version); - - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - - grad.gate_grad += frame_size * 4; - grad.state_grad += frame_size; - grad.state_active_grad += frame_size; - grad.output_grad += frame_size; - if (grad.prev_state_grad) { - grad.prev_state_grad += frame_size; - } - } - } -}; - template struct LstmUnitGradFunctor { static void compute(const CPUContext& context, @@ -171,11 +96,6 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - template class LstmUnitFunctor; template class LstmUnitFunctor; template class LstmUnitGradFunctor; diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 033c50e537da6..042b333ad451a 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -39,22 +39,6 @@ namespace funcs { using float16 = phi::dtype::float16; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -85,46 +69,20 @@ template struct SetConstant>; #endif -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct 
Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ template struct Transpose, RANK>; DEFINE_CPU_TRANS(1); @@ -163,8 +121,7 @@ void TransposeNormal::operator()( } // define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal; \ +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ template struct TransposeNormal DEFINE_CPU_TRANS_NORMAL(phi::dtype::float16); @@ -291,6 +248,31 @@ void set_constant(const paddle::platform::DeviceContext& context, #endif } +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(paddle::platform::CPUDeviceContext* ctx, + const paddle::framework::Tensor& src, + paddle::framework::Tensor* dst) { + auto in = paddle::framework::EigenVector::Flatten(src); + auto out = paddle::framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; +template struct ElementwiseAddTo; + template struct RowwiseAdd { void operator()(const paddle::platform::CPUDeviceContext& context, @@ -333,41 +315,5 @@ struct RowwiseAdd { template struct RowwiseAdd; template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct RowwiseSum; -template struct RowwiseSum; - -template struct RowwiseMean; -template struct RowwiseMean; - -template struct RowwiseMean; -template struct RowwiseMean; - -template -struct ElementwiseAddTo { - void operator()(paddle::platform::CPUDeviceContext* ctx, - const paddle::framework::Tensor& src, - paddle::framework::Tensor* dst) { - auto in = paddle::framework::EigenVector::Flatten(src); - auto out = paddle::framework::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } -}; - -template struct ElementwiseAddTo; -template struct ElementwiseAddTo; - } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cc b/paddle/phi/kernels/funcs/matrix_inverse.cc index c95e97f8ea81a..c316970e6a560 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cc +++ b/paddle/phi/kernels/funcs/matrix_inverse.cc @@ -29,9 +29,5 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, template class MatrixInverseFunctor; template class MatrixInverseFunctor; -// TODO(chenweihang): remove these instantiations later -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - } // 
namespace funcs } // namespace phi diff --git a/paddle/phi/tests/api/test_sparse_utils_api.cc b/paddle/phi/tests/api/test_sparse_utils_api.cc index e02017555111c..d5891baaf10a2 100644 --- a/paddle/phi/tests/api/test_sparse_utils_api.cc +++ b/paddle/phi/tests/api/test_sparse_utils_api.cc @@ -48,7 +48,6 @@ TEST(API, to_sparse_coo) { std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data); phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); // 1. test dense_to_sparse_coo paddle::experimental::Tensor x(dense_x); diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu index 50b9e198da08b..95334ac36a608 100644 --- a/paddle/phi/tests/common/test_scalar.cu +++ b/paddle/phi/tests/common/test_scalar.cu @@ -47,7 +47,6 @@ TEST(Scalar, ConstructFromDenseTensor1) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -67,7 +66,6 @@ TEST(Scalar, ConstructFromDenseTensor2) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -87,7 +85,6 @@ TEST(Scalar, ConstructFromDenseTensor3) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -107,7 +104,6 @@ TEST(Scalar, ConstructFromDenseTensor4) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = true; @@ -127,7 +123,6 @@ TEST(Scalar, ConstructFromDenseTensor5) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -148,7 +143,6 @@ TEST(Scalar, ConstructFromDenseTensor6) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -170,7 +164,6 @@ TEST(Scalar, ConstructFromDenseTensor7) { .GetAllocator(phi::GPUPlace()) .get()); dev_ctx.Init(); - auto* dense_x_data = dev_ctx.Alloc(&dense_x); FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); dev_ctx.Wait(); diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index c299559da5914..3d549aa5f160c 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -24,10 +24,6 @@ cc_test( test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos) -cc_test( - test_phi_device_context - SRCS test_device_context.cc - DEPS phi_context cpu_context) cc_test( test_meta_fn_utils SRCS test_meta_fn_utils.cc diff --git a/paddle/phi/tests/core/test_device_context.cc b/paddle/phi/tests/core/test_device_context.cc deleted file mode 100644 index 844330ee097ef..0000000000000 --- a/paddle/phi/tests/core/test_device_context.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "gtest/gtest.h" - -// TODO(wilber): will remove after the cpu, gpu context megre. -#include "paddle/phi/backends/cpu/cpu_context.h" -// #include "paddle/phi/backends/all_context.h" - -// NOTE: The paddle framework should add WITH_EIGEN option to support compile -// without eigen. -#include "unsupported/Eigen/CXX11/Tensor" - -namespace phi { -namespace tests { - -class InferenceCPUContext : public CPUContext { - public: - void SetEigenDevice(Eigen::DefaultDevice* eigen_device) { - CPUContext::SetEigenDevice(eigen_device); - } -}; - -TEST(DeviceContext, cpu_context) { - std::cout << "test training scenarios" << std::endl; - { - phi::CPUContext ctx; - ctx.Init(); - EXPECT_TRUE(ctx.eigen_device() != nullptr); - } - - std::cout << "test inference scenarios" << std::endl; - Eigen::DefaultDevice* device = new Eigen::DefaultDevice(); - { - InferenceCPUContext ctx; - ctx.SetEigenDevice(device); - EXPECT_TRUE(ctx.eigen_device() != nullptr); - } - delete device; -} - -} // namespace tests -} // namespace phi diff --git a/paddle/phi/tests/kernels/test_cast_dev_api.cc b/paddle/phi/tests/kernels/test_cast_dev_api.cc index 179e44f0f0f12..d43cd075ed590 100644 --- a/paddle/phi/tests/kernels/test_cast_dev_api.cc +++ b/paddle/phi/tests/kernels/test_cast_dev_api.cc @@ -52,7 +52,6 @@ TEST(DEV_API, cast) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); phi::DataType out_dtype = phi::DataType::FLOAT64; // 2. test API diff --git a/paddle/phi/tests/kernels/test_concat_dev_api.cc b/paddle/phi/tests/kernels/test_concat_dev_api.cc index 0dd58b1bba938..9283fcd0b65f4 100644 --- a/paddle/phi/tests/kernels/test_concat_dev_api.cc +++ b/paddle/phi/tests/kernels/test_concat_dev_api.cc @@ -60,7 +60,6 @@ TEST(DEV_API, concat) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Concat(dev_ctx, inputs, 0); // 3. check result diff --git a/paddle/phi/tests/kernels/test_conj_dev_api.cc b/paddle/phi/tests/kernels/test_conj_dev_api.cc index 5ac676ffcbcae..2f7ab8383733f 100644 --- a/paddle/phi/tests/kernels/test_conj_dev_api.cc +++ b/paddle/phi/tests/kernels/test_conj_dev_api.cc @@ -48,7 +48,6 @@ TEST(DEV_API, conj) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. 
test API auto out = phi::Conj(dev_ctx, dense_x); diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index 1c9b17ed613e4..c2df0a8acdccf 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -65,7 +65,6 @@ TEST(DEV_API, copy) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); phi::Copy( dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); diff --git a/paddle/phi/tests/kernels/test_creation_dev_api.cc b/paddle/phi/tests/kernels/test_creation_dev_api.cc index 2dcd8739991f8..5685c3a2a0b0d 100644 --- a/paddle/phi/tests/kernels/test_creation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_creation_dev_api.cc @@ -36,7 +36,6 @@ TEST(DEV_API, empty) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = phi::Empty(dev_ctx, {3, 2}); @@ -66,7 +65,6 @@ TEST(DEV_API, empty_like) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::EmptyLike(dev_ctx, dense_x); // 3. check result @@ -86,7 +84,6 @@ TEST(DEV_API, full) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Full(dev_ctx, {3, 2}, val); // 3. check result @@ -119,7 +116,6 @@ TEST(DEV_API, full_like) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = phi::FullLike(dev_ctx, dense_x, val); diff --git a/paddle/phi/tests/kernels/test_dot_dev_api.cc b/paddle/phi/tests/kernels/test_dot_dev_api.cc index de20907cadf44..a2af0471df0d0 100644 --- a/paddle/phi/tests/kernels/test_dot_dev_api.cc +++ b/paddle/phi/tests/kernels/test_dot_dev_api.cc @@ -61,7 +61,6 @@ TEST(DEV_API, dot) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Dot(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 63f8b86a534ed..4100889d3ac41 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -66,7 +66,6 @@ TEST(DEV_API, add) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Add(dev_ctx, dense_x, dense_y); // 3. check result @@ -118,7 +117,6 @@ TEST(DEV_API, subtract) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Subtract(dev_ctx, dense_x, dense_y); // 3. check result @@ -170,7 +168,6 @@ TEST(DEV_API, divide) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Divide(dev_ctx, dense_x, dense_y); // 3. 
check result @@ -222,7 +219,6 @@ TEST(DEV_API, multiply) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Multiply(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc index fb1cdee7e5fba..860af4c4a4dce 100644 --- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc @@ -52,7 +52,6 @@ TEST(DEV_API, flatten) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = phi::Flatten(dev_ctx, dense_x, start_axis, stop_axis); diff --git a/paddle/phi/tests/kernels/test_math_function.cc b/paddle/phi/tests/kernels/test_math_function.cc index 29f33c555d1aa..a13a8cb564f94 100644 --- a/paddle/phi/tests/kernels/test_math_function.cc +++ b/paddle/phi/tests/kernels/test_math_function.cc @@ -273,7 +273,6 @@ TEST(math_funciton, set_constant) { t.Resize({10, 10}); t.mutable_data(paddle::platform::CPUPlace()); auto* ctx = new paddle::platform::CPUDeviceContext(); - ctx->Init(); phi::funcs::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { PADDLE_ENFORCE_EQ(10, diff --git a/paddle/phi/tests/kernels/test_matmul_dev_api.cc b/paddle/phi/tests/kernels/test_matmul_dev_api.cc index f25acaf9bcc3f..374a05fc5e475 100644 --- a/paddle/phi/tests/kernels/test_matmul_dev_api.cc +++ b/paddle/phi/tests/kernels/test_matmul_dev_api.cc @@ -58,7 +58,6 @@ TEST(DEV_API, dot) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = Matmul(dev_ctx, dense_x, dense_y, false, false); // 3. check result diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc index 6f3f91a7dbe56..1c79150391379 100644 --- a/paddle/phi/tests/kernels/test_mean_dev_api.cc +++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc @@ -51,7 +51,6 @@ TEST(DEV_API, mean) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Mean(dev_ctx, dense_x, dims, false); // 3. check result diff --git a/paddle/phi/tests/kernels/test_reshape_dev_api.cc b/paddle/phi/tests/kernels/test_reshape_dev_api.cc index f0f521d57dbd8..708b31cb9a9ce 100644 --- a/paddle/phi/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/phi/tests/kernels/test_reshape_dev_api.cc @@ -54,7 +54,6 @@ TEST(DEV_API, reshape) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Reshape(dev_ctx, dense_x, shape); // 3. 
check result std::vector expect_shape = {12, 3}; diff --git a/paddle/phi/tests/kernels/test_scale_dev_api.cc b/paddle/phi/tests/kernels/test_scale_dev_api.cc index eff18bdeecaab..57e186ab393ec 100644 --- a/paddle/phi/tests/kernels/test_scale_dev_api.cc +++ b/paddle/phi/tests/kernels/test_scale_dev_api.cc @@ -51,7 +51,6 @@ TEST(DEV_API, scale) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); @@ -93,7 +92,6 @@ TEST(DEV_API, scale_host) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); diff --git a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc index d1c464e4b1c9d..51d1e67f5af2a 100644 --- a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc @@ -42,7 +42,6 @@ TEST(DEV_API, sparse_relu) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); DenseTensor dense_x = phi::Empty(dev_ctx_cpu, diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index bb84690cd07ee..f08c7b0872b93 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -75,7 +75,6 @@ void TestConv3dBase(const std::vector& indices, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; diff --git a/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc index 50848ae5f1ce7..cbac854d48ea4 100644 --- a/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc @@ -113,7 +113,6 @@ TEST(DEV_API, sparse_elementwise_coo_kernel_double) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto coo_x = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, sparse_dim); auto coo_y = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_y, sparse_dim); @@ -159,7 +158,6 @@ TEST(DEV_API, sparse_elementwise_csr_kernel_float) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto csr_x = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_x); auto csr_y = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_y); @@ -357,7 +355,6 @@ TEST(DEV_API, sparse_elementwise_csr_grad_kernel_float) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto csr_x = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_x); auto csr_y = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_y); @@ -404,7 +401,6 @@ TEST(DEV_API, sparse_elementwise_coo_grad_kernel_double) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto csr_x = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, sparse_dim); auto csr_y = 
sparse::DenseToSparseCoo(dev_ctx_cpu, dense_y, sparse_dim); diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 7d7cd1ceaf57e..460dca59c718c 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -60,7 +60,6 @@ void TestMaxPoolBase(const std::vector& indices, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx_cpu.Init(); const int in_channels = x_dims[4]; const int out_channels = in_channels; diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc index d4f1d6efb5d93..70c9f4cfc611d 100644 --- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc @@ -88,7 +88,6 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, paddle::platform::CPUPlace()); phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -307,7 +306,6 @@ void TestSparseCsrToCoo(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -489,7 +487,6 @@ void TestCooToCsr(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -588,7 +585,6 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x, const auto alloc = std::make_shared( paddle::platform::CPUPlace()); phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -701,7 +697,6 @@ void TestSparseCooToDense(const DDim& dense_dims, const int64_t non_zero_num, const int64_t sparse_dim) { phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -879,7 +874,6 @@ void TestSparseCsrToDense(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) diff --git a/paddle/phi/tests/kernels/test_split_dev_api.cc b/paddle/phi/tests/kernels/test_split_dev_api.cc index a358fcdf28db0..0389ab7afba1a 100644 --- a/paddle/phi/tests/kernels/test_split_dev_api.cc +++ b/paddle/phi/tests/kernels/test_split_dev_api.cc @@ -40,7 +40,6 @@ TEST(DEV_API, split) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); for (size_t i = 0; i < 4; ++i) { diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc index 2cd677373f4ef..20e934eb69297 100644 --- a/paddle/phi/tests/kernels/test_sum_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc @@ -49,7 +49,6 @@ TEST(DEV_API, sum) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. 
test API auto out = From 755438a7372991ffc8574bf8f04f264954939e43 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sat, 2 Jul 2022 10:38:57 -0500 Subject: [PATCH 038/250] unify cpu context, part2 (#44012) * fix init() * delete test_device_context * replace CPUDeviceContext with CPUContext * fix test_scalar * remove dot_op.cc * fix compile --- .../fluid/distributed/collective/reducer.cc | 6 +- paddle/fluid/distributed/common/utils.h | 6 +- .../ps/service/communicator/communicator.cc | 27 +++--- .../ps/service/communicator/communicator.h | 16 ++-- paddle/fluid/eager/nan_inf_utils.cc | 3 +- .../framework/data_device_transform_test.cu | 5 +- .../fluid/framework/data_layout_transform.cc | 4 +- paddle/fluid/framework/data_type_transform.cc | 4 +- .../details/broadcast_op_handle_test.h | 2 +- .../details/gather_op_handle_test.cc | 2 +- .../framework/details/nan_inf_utils_detail.cc | 14 ++-- .../details/reduce_op_handle_test.cc | 2 +- paddle/fluid/framework/lod_tensor.h | 2 +- paddle/fluid/framework/op_registry_test.cc | 31 +++---- paddle/fluid/framework/operator_test.cc | 15 ++-- paddle/fluid/framework/phi_utils.h | 2 +- .../framework/selected_rows_utils_test.cc | 2 +- paddle/fluid/framework/tensor_util.cc | 4 +- paddle/fluid/framework/tensor_util_test.cc | 37 +++++---- paddle/fluid/imperative/gloo_context.cc | 6 +- paddle/fluid/imperative/gloo_context.h | 2 +- .../fluid/imperative/gradient_accumulator.cc | 20 ++--- paddle/fluid/imperative/reducer.cc | 27 +++--- .../analysis/ir_passes/lite_subgraph_pass.cc | 2 +- .../passes/convert_to_mixed_precision.cc | 2 +- .../fluid/inference/lite/test_engine_lite.cc | 2 +- .../tensorrt/convert/test_io_converter.cc | 2 +- paddle/fluid/operators/activation_op.cc | 24 +++--- .../operators/add_position_encoding_op.cc | 11 ++- paddle/fluid/operators/affine_channel_op.cc | 2 +- paddle/fluid/operators/affine_grid_op.cc | 16 ++-- paddle/fluid/operators/allclose_op.cc | 2 +- .../operators/amp/alloc_float_status_op.cc | 2 +- .../amp/check_finite_and_unscale_op.cc | 7 +- .../operators/amp/clear_float_status_op.cc | 2 +- .../operators/amp/get_float_status_op.cc | 2 +- .../operators/amp/update_loss_scaling_op.cc | 10 +-- paddle/fluid/operators/angle_op.cc | 20 ++--- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- paddle/fluid/operators/assign_op_test.cc | 6 +- paddle/fluid/operators/attention_lstm_op.cc | 6 +- .../fluid/operators/average_accumulates_op.cc | 25 +++--- paddle/fluid/operators/batch_fc_op.cc | 7 +- .../fluid/operators/beam_search_decode_op.h | 2 +- paddle/fluid/operators/beam_search_op.cc | 11 ++- paddle/fluid/operators/bmm_op.cc | 14 ++-- paddle/fluid/operators/bpr_loss_op.cc | 2 +- paddle/fluid/operators/cast_op.cc | 2 +- paddle/fluid/operators/center_loss_op.cc | 2 +- .../operators/cinn/cinn_instruction_run_op.cc | 6 +- paddle/fluid/operators/cinn/cinn_launch_op.cc | 5 +- paddle/fluid/operators/clip_by_norm_op.cc | 5 +- paddle/fluid/operators/coalesce_tensor_op.cc | 25 +++--- .../operators/collective/allreduce_op.cc | 13 ++- paddle/fluid/operators/complex_op.cc | 16 ++-- paddle/fluid/operators/complex_view_op.cc | 16 ++-- paddle/fluid/operators/cos_sim_op.cc | 8 +- paddle/fluid/operators/crf_decoding_op.cc | 7 +- paddle/fluid/operators/crop_op.cc | 14 ++-- paddle/fluid/operators/crop_tensor_op.cc | 22 +++-- paddle/fluid/operators/cross_entropy_op.cc | 2 +- paddle/fluid/operators/ctc_align_op.cc | 7 +- paddle/fluid/operators/cum_op.cc | 2 +- paddle/fluid/operators/data_norm_op.cc | 20 ++--- .../operators/deformable_psroi_pooling_op.cc | 2 +- 
.../fluid/operators/dequantize_abs_max_op.cc | 10 +-- paddle/fluid/operators/dequantize_log_op.cc | 8 +- .../operators/detection/bipartite_match_op.cc | 6 +- .../fluid/operators/detection/box_clip_op.cc | 7 +- .../fluid/operators/detection/box_clip_op.h | 3 +- .../fluid/operators/detection/box_coder_op.cc | 7 +- .../detection/box_decoder_and_assign_op.cc | 7 +- .../detection/generate_mask_labels_op.cc | 6 +- .../detection/generate_proposal_labels_op.cc | 37 ++++----- .../detection/generate_proposals_op.cc | 9 +- .../detection/generate_proposals_v2_op.cc | 9 +- .../operators/detection/iou_similarity_op.cc | 7 +- .../detection/locality_aware_nms_op.cc | 2 +- .../detection/mine_hard_examples_op.cc | 7 +- .../operators/detection/multiclass_nms_op.cc | 4 +- .../retinanet_detection_output_op.cc | 2 +- .../detection/rpn_target_assign_op.cc | 30 ++++--- .../detection/sigmoid_focal_loss_op.cc | 12 ++- .../operators/detection/target_assign_op.cc | 17 ++-- paddle/fluid/operators/determinant_op.cc | 16 ++-- paddle/fluid/operators/dgc_clip_by_norm_op.cc | 5 +- paddle/fluid/operators/diag_embed_op.cc | 11 ++- paddle/fluid/operators/diag_op.cc | 11 ++- paddle/fluid/operators/dirichlet_op.cc | 17 ++-- paddle/fluid/operators/dropout_op_test.cc | 2 +- paddle/fluid/operators/eig_op.cc | 22 ++--- paddle/fluid/operators/eig_op.h | 2 +- paddle/fluid/operators/eigvals_op.cc | 13 ++- paddle/fluid/operators/eigvalsh_op.cc | 27 +++--- paddle/fluid/operators/expand_as_op.cc | 24 +++--- paddle/fluid/operators/expand_op.cc | 24 +++--- paddle/fluid/operators/exponential_op.cc | 14 ++-- paddle/fluid/operators/fake_dequantize_op.cc | 18 ++-- paddle/fluid/operators/fake_quantize_op.cc | 65 +++++++-------- paddle/fluid/operators/fc_op.cc | 7 +- paddle/fluid/operators/fill_any_op.cc | 26 +++--- .../fill_constant_batch_size_like_op_mlu.cc | 4 +- .../fill_constant_batch_size_like_op_npu.cc | 4 +- paddle/fluid/operators/fill_constant_op.h | 4 +- paddle/fluid/operators/fill_zeros_like_op.cc | 30 ++++--- paddle/fluid/operators/flatten_op.cc | 60 +++++++------- paddle/fluid/operators/fold_op.cc | 14 ++-- paddle/fluid/operators/frame_op.cc | 28 +++---- paddle/fluid/operators/fsp_op.cc | 14 ++-- .../fused/fused_elemwise_activation_op.cc | 24 ++---- .../fused/fused_embedding_fc_lstm_op.cc | 4 +- .../fused/fused_embedding_seq_pool_op.h | 4 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 4 +- .../fluid/operators/fused/fusion_lstm_op.cc | 34 ++++---- .../fused/fusion_seqconv_eltadd_relu_op.cc | 2 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 4 +- .../fused/mkldnn/fusion_gru_mkldnn_op.cc | 2 +- .../fused/mkldnn/fusion_lstm_mkldnn_op.cc | 2 +- .../fused/mkldnn/fusion_rnn_mkldnn.h | 4 +- .../fused/mkldnn/multi_gru_mkldnn_op.cc | 2 +- .../fluid/operators/fused_softmax_mask_op.cc | 7 +- .../fused_softmax_mask_upper_triangle_op.cc | 11 +-- paddle/fluid/operators/gather_test.cc | 2 +- .../fluid/operators/graph_khop_sampler_op.cc | 2 +- paddle/fluid/operators/gru_op.cc | 9 +- paddle/fluid/operators/gru_unit_op.cc | 14 ++-- paddle/fluid/operators/hinge_loss_op.cc | 10 +-- paddle/fluid/operators/im2sequence_op.cc | 10 +-- paddle/fluid/operators/inplace_abn_op.cc | 14 ++-- paddle/fluid/operators/interpolate_op.h | 12 +-- paddle/fluid/operators/inverse_op.cc | 14 ++-- paddle/fluid/operators/isfinite_op.cc | 83 ++++++------------- paddle/fluid/operators/l1_norm_op.cc | 8 +- paddle/fluid/operators/linear_chain_crf_op.cc | 12 ++- paddle/fluid/operators/linear_chain_crf_op.h | 25 +++--- .../operators/lite/lite_engine_op_test.cc | 2 +- 
paddle/fluid/operators/lite/ut_helper.h | 2 +- paddle/fluid/operators/load_combine_op.cc | 13 ++- paddle/fluid/operators/load_op.cc | 13 ++- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- paddle/fluid/operators/lookup_table_op.h | 6 +- paddle/fluid/operators/lookup_table_v2_op.h | 3 +- paddle/fluid/operators/lrn_op.cc | 24 +++--- paddle/fluid/operators/lstm_op.cc | 14 ++-- paddle/fluid/operators/lstmp_op.cc | 14 ++-- paddle/fluid/operators/lstsq_op.cc | 7 +- paddle/fluid/operators/lu_op.cc | 8 +- paddle/fluid/operators/lu_unpack_op.cc | 11 ++- paddle/fluid/operators/margin_rank_loss_op.cc | 10 +-- .../fluid/operators/match_matrix_tensor_op.cc | 12 ++- .../fluid/operators/math/beam_search_test.cc | 3 +- .../fluid/operators/math/concat_and_split.cc | 14 ++-- paddle/fluid/operators/math/concat_test.cc | 3 +- .../fluid/operators/math/cos_sim_functor.cc | 8 +- .../operators/math/eigen_values_vectors.h | 5 +- paddle/fluid/operators/math/gru_compute.cc | 40 ++++----- paddle/fluid/operators/math/im2col_test.cc | 6 +- .../fluid/operators/math/matrix_bit_code.cc | 9 +- paddle/fluid/operators/math/matrix_solve.cc | 10 +-- .../operators/math/selected_rows_functor.cc | 64 +++++++------- .../math/selected_rows_functor_test.cc | 74 +++++++---------- .../operators/math/sequence_padding_test.cc | 8 +- .../fluid/operators/math/sequence_pooling.cc | 34 ++++---- .../operators/math/sequence_pooling_test.cc | 8 +- paddle/fluid/operators/math/softmax_impl.h | 2 +- paddle/fluid/operators/math/squared_l2_norm.h | 2 +- paddle/fluid/operators/math/tree2col.cc | 20 ++--- paddle/fluid/operators/math/unpooling.cc | 32 +++---- paddle/fluid/operators/math/vol2col_test.cc | 2 +- paddle/fluid/operators/matmul_op.cc | 23 +++-- paddle/fluid/operators/mean_iou_op.h | 4 +- .../fluid/operators/merge_selected_rows_op.cc | 7 +- paddle/fluid/operators/minus_op.cc | 3 +- .../operators/mkldnn/reshape_mkldnn_op.cc | 3 +- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 2 +- .../fluid/operators/modified_huber_loss_op.cc | 5 +- paddle/fluid/operators/norm_op.cc | 2 +- paddle/fluid/operators/one_hot_op.cc | 7 +- .../optimizers/decayed_adagrad_op.cc | 5 +- .../operators/optimizers/dgc_momentum_op.cc | 5 +- .../distributed_fused_lamb_init_op.cc | 2 +- .../optimizers/distributed_fused_lamb_op.cc | 2 +- paddle/fluid/operators/optimizers/dpsgd_op.cc | 7 +- paddle/fluid/operators/optimizers/ftrl_op.cc | 3 +- paddle/fluid/operators/optimizers/lamb_op.cc | 7 +- .../operators/optimizers/merged_adam_op.cc | 7 +- .../optimizers/merged_momentum_op.cc | 7 +- .../optimizers/mkldnn/sgd_mkldnn_op.cc | 2 +- .../pow2_decay_with_linear_warmup_op.cc | 4 +- .../optimizers/proximal_adagrad_op.cc | 5 +- .../operators/optimizers/proximal_gd_op.cc | 5 +- paddle/fluid/operators/optimizers/sgd_op.h | 3 +- .../optimizers/sparse_momentum_op.cc | 7 +- paddle/fluid/operators/overlap_add_op.cc | 26 +++--- paddle/fluid/operators/p_norm_op.cc | 2 +- paddle/fluid/operators/pad2d_op.cc | 4 +- .../fluid/operators/pad_constant_like_op.cc | 20 ++--- paddle/fluid/operators/partial_concat_op.cc | 11 ++- paddle/fluid/operators/partial_concat_op.h | 4 +- paddle/fluid/operators/partial_sum_op.cc | 11 ++- paddle/fluid/operators/partial_sum_op.h | 4 +- paddle/fluid/operators/pool_op_mlu.cc | 4 +- paddle/fluid/operators/prroi_pool_op.cc | 22 +++-- .../operators/prune_gate_by_capacity_op.cc | 5 +- .../pscore/distributed_lookup_table_op.cc | 3 +- .../pscore/distributed_push_sparse_op.cc | 5 +- .../pscore/heter_cloud_comm_cpu_test.cc | 6 +- .../pscore/heter_listen_and_server_test.cc 
| 4 +- .../operators/pscore/heter_server_test.cc | 4 +- .../operators/pscore/send_and_recv_op.cc | 11 ++- .../pscore/send_and_recv_op_cpu_test.cc | 4 +- .../pscore/send_and_recv_op_gpu_test.cc | 2 +- .../operators/pscore/switch_server_test.cc | 2 +- paddle/fluid/operators/py_layer_op.cc | 30 +++---- paddle/fluid/operators/pyramid_hash_op.cc | 12 ++- paddle/fluid/operators/qr_op.cc | 7 +- paddle/fluid/operators/quantize_linear_op.cc | 10 +-- paddle/fluid/operators/random_crop_op.cc | 2 +- paddle/fluid/operators/random_crop_op.h | 4 +- paddle/fluid/operators/rank_attention_op.cc | 7 +- paddle/fluid/operators/rank_loss_op.cc | 8 +- .../operators/reduce_ops/reduce_amax_op.cc | 34 +++----- .../operators/reduce_ops/reduce_amin_op.cc | 34 +++----- .../fluid/operators/repeat_interleave_op.cc | 20 ++--- paddle/fluid/operators/reshape_op.cc | 6 +- paddle/fluid/operators/row_conv_op.cc | 14 ++-- paddle/fluid/operators/run_program_op.cc | 10 +-- paddle/fluid/operators/sample_logits_op.h | 11 +-- paddle/fluid/operators/save_combine_op.cc | 11 ++- paddle/fluid/operators/save_op.cc | 20 ++--- paddle/fluid/operators/scatter_test.cc | 2 +- paddle/fluid/operators/search_compute.h | 2 +- paddle/fluid/operators/seed_op.cc | 3 +- paddle/fluid/operators/seed_op.cu | 4 +- .../sequence_ops/sequence_concat_op.cc | 22 +++-- .../sequence_ops/sequence_conv_op.cc | 14 ++-- .../sequence_ops/sequence_enumerate_op.cc | 7 +- .../sequence_ops/sequence_erase_op.cc | 7 +- .../sequence_ops/sequence_expand_as_op.cc | 20 ++--- .../sequence_ops/sequence_expand_as_op.h | 8 +- .../sequence_ops/sequence_expand_op.cc | 22 +++-- .../sequence_ops/sequence_expand_op.h | 10 +-- .../sequence_ops/sequence_mask_op.cc | 12 +-- .../operators/sequence_ops/sequence_pad_op.cc | 22 +++-- .../sequence_ops/sequence_pool_op.cc | 16 ++-- .../sequence_ops/sequence_reshape_op.cc | 22 +++-- .../sequence_ops/sequence_reverse_op.cc | 13 ++- .../sequence_ops/sequence_slice_op.cc | 20 ++--- .../sequence_ops/sequence_softmax_op.cc | 14 ++-- .../sequence_ops/sequence_softmax_op.h | 8 +- .../sequence_topk_avg_pooling_op.cc | 6 +- .../sequence_topk_avg_pooling_op.h | 5 +- .../sequence_ops/sequence_unpad_op.cc | 20 ++--- paddle/fluid/operators/shuffle_channel_op.cc | 12 ++- paddle/fluid/operators/slice_op.cc | 38 ++++----- paddle/fluid/operators/smooth_l1_loss_op.cc | 10 +-- paddle/fluid/operators/solve_op.cc | 14 ++-- paddle/fluid/operators/space_to_depth_op.cc | 22 +++-- paddle/fluid/operators/spectral_helper.h | 38 ++++----- paddle/fluid/operators/spectral_norm_op.cc | 14 ++-- paddle/fluid/operators/spectral_op.cc | 42 ++++------ paddle/fluid/operators/spp_op.cc | 14 ++-- .../fluid/operators/squared_l2_distance_op.cc | 8 +- paddle/fluid/operators/squared_l2_norm_op.cc | 14 ++-- paddle/fluid/operators/squeeze_op.cc | 46 +++++----- paddle/fluid/operators/stft_op.cc | 16 ++-- paddle/fluid/operators/strided_memcpy_test.cc | 4 +- paddle/fluid/operators/sum_op.cc | 11 ++- paddle/fluid/operators/svd_op.cc | 7 +- paddle/fluid/operators/svd_op.h | 8 +- .../test_leaky_relu_grad_grad_functor.h | 5 +- paddle/fluid/operators/tree_conv_op.cc | 16 ++-- .../fluid/operators/unique_consecutive_op.cc | 11 ++- paddle/fluid/operators/unpool_op.cc | 28 +++---- paddle/fluid/operators/unsqueeze_op.cc | 49 +++++------ paddle/fluid/operators/var_conv_2d_op.cc | 15 ++-- .../platform/device/npu/npu_op_runner.cc | 8 +- paddle/fluid/platform/device_context.cc | 4 +- paddle/fluid/platform/device_context.h | 5 +- paddle/fluid/platform/transform_test.cu | 10 +-- 
paddle/fluid/pybind/pybind.cc | 2 +- paddle/fluid/pybind/tensor_py.h | 13 ++- paddle/phi/kernels/funcs/gru_compute.cc | 26 +++--- paddle/phi/kernels/funcs/math_function.cc | 18 ++-- paddle/phi/kernels/funcs/math_function_impl.h | 12 +-- paddle/phi/kernels/funcs/sequence2batch.cc | 21 ++--- paddle/phi/tests/common/test_scalar.cu | 1 + .../phi/tests/kernels/test_math_function.cc | 20 ++--- .../tests/custom_op/custom_raw_op_kernel_op.h | 2 +- 289 files changed, 1525 insertions(+), 1909 deletions(-) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 4262161b1bc45..dda5f2eee6e8f 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -251,7 +251,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { "Please recompile or reinstall Paddle with NCCL support.")); #endif } else if (platform::is_cpu_place(place)) { - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); ConcatTensorsWithType( *default_ctx, dense_tensors_, &dense_contents_, dtype_); @@ -274,7 +274,7 @@ void EagerGroup::SplitTensors(const platform::Place &place) { "Please recompile or reinstall Paddle with NCCL support.")); #endif } else if (platform::is_cpu_place(place)) { - auto *default_ctx = static_cast( + auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); SplitTensorsWithType( *default_ctx, &dense_contents_, &dense_tensors_, dtype_); @@ -891,7 +891,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, "Please recompile or reinstall Paddle with NCCL support.")); #endif } else if (platform::is_cpu_place(inner_place_)) { - dev_ctx = static_cast( + dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(inner_place_)); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h index 847e85a13415d..1abfb57b99dab 100644 --- a/paddle/fluid/distributed/common/utils.h +++ b/paddle/fluid/distributed/common/utils.h @@ -31,9 +31,9 @@ namespace paddle { namespace distributed { template -inline phi::funcs::BlasT GetBlas() { - paddle::platform::CPUDeviceContext cpu_ctx; - return phi::funcs::GetBlas(cpu_ctx); +inline phi::funcs::BlasT GetBlas() { + phi::CPUContext cpu_ctx; + return phi::funcs::GetBlas(cpu_ctx); } template diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.cc b/paddle/fluid/distributed/ps/service/communicator/communicator.cc index 990dbc845f0ad..0856c81121f89 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.cc +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.cc @@ -353,11 +353,12 @@ void Communicator::RpcRecvSparse(const std::string &varname, bool training = true; - auto status = _worker_ptr->PullSparseParam((float **)push_g_vec.data(), - table_id, // NOLINT - sparse_push_keys.data(), - sparse_push_keys.size(), - training); + auto status = + _worker_ptr->PullSparseParam(static_cast(push_g_vec.data()), + table_id, + sparse_push_keys.data(), + sparse_push_keys.size(), + training); status.wait(); return; } @@ -1184,12 +1185,12 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) { auto &t_latest = var_latest->Get(); auto t_timestamp = var_timestamp->GetMutable(); - paddle::platform::CPUDeviceContext cpu_ctx; + phi::CPUContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = 
var_delta->GetMutable(); t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); - auto blas = phi::funcs::GetBlas(cpu_ctx); + auto blas = phi::funcs::GetBlas(cpu_ctx); blas.VSUB(t_latest.numel(), t_latest.data(), t_timestamp->data(), @@ -1218,7 +1219,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) { RpcRecvDense(varnames, table_id, pserver_scope_.get()); // 2.1 pserver - old => delta; 2.2 latest + old => latest 2.3 old => pserver - paddle::platform::CPUDeviceContext cpu_ctx; + phi::CPUContext cpu_ctx; for (auto &varname : varnames) { auto *var_latest = recv_scope_->FindVar(varname); auto t_latest = var_latest->GetMutable(); @@ -1233,7 +1234,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) { auto *t_delta = var_delta->GetMutable(); t_delta->mutable_data(t_latest->dims(), cpu_ctx.GetPlace()); - auto blas = phi::funcs::GetBlas(cpu_ctx); + auto blas = phi::funcs::GetBlas(cpu_ctx); blas.VSUB(t_latest->numel(), t_pserver.data(), t_old->data(), @@ -1334,7 +1335,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, auto *t_old = var_old->GetMutable(); auto dims1 = t_latest.dims()[1]; - paddle::platform::CPUDeviceContext cpu_ctx; + phi::CPUContext cpu_ctx; auto *var_delta = delta_scope_->Var(varname); auto *t_delta = var_delta->GetMutable(); @@ -1345,7 +1346,7 @@ void GeoCommunicator::SendSparse(const std::string &varname, t_delta->set_rows(sparse_ids); t_delta->set_height(t_latest.dims()[0]); - auto blas = phi::funcs::GetBlas(cpu_ctx); + auto blas = phi::funcs::GetBlas(cpu_ctx); float coefficient = 1.0 / static_cast(trainers_); std::vector push_g_vec; @@ -1419,8 +1420,8 @@ void GeoCommunicator::RecvSparse(const std::string &varname, std::vector v_delta; v_delta.resize(numel); - paddle::platform::CPUDeviceContext cpu_ctx; - auto blas = phi::funcs::GetBlas(cpu_ctx); + phi::CPUContext cpu_ctx; + auto blas = phi::funcs::GetBlas(cpu_ctx); for (auto j = 0; j < static_cast(keys.size()); ++j) { VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j] diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h index 69589da8b3031..f08208ed02d70 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h @@ -185,9 +185,8 @@ inline void MergeVars(const std::string &var_name, } // set output tensor to 0. 
- paddle::platform::CPUDeviceContext cpu_ctx; - phi::funcs::SetConstant - constant_functor; + phi::CPUContext cpu_ctx; + phi::funcs::SetConstant constant_functor; constant_functor(cpu_ctx, out_t, static_cast(0)); // sum all vars to out auto result = EigenVector::Flatten(*out_t); @@ -210,16 +209,13 @@ inline void MergeVars(const std::string &var_name, for (auto &var : vars) { inputs.push_back(&var->Get()); } - paddle::platform::CPUDeviceContext dev_ctx; + phi::CPUContext dev_ctx; if (merge_add) { - paddle::operators::math::scatter:: - MergeAdd - merge_add; + paddle::operators::math::scatter::MergeAdd merge_add; merge_add(dev_ctx, inputs, out_slr); } else { - paddle::operators::math::scatter:: - MergeAverage - merge_average; + paddle::operators::math::scatter::MergeAverage + merge_average; merge_average(dev_ctx, inputs, out_slr); } diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 1d45ef696b880..6b2b9c9f34a6d 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -48,8 +48,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { #endif return; } - paddle::framework::details::tensor_check< - paddle::platform::CPUDeviceContext>( + paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); } } diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index a3a5a25eec842..94e7918e800ef 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -90,9 +90,8 @@ REGISTER_OP_WITHOUT_GRADIENT( test_op, paddle::framework::TestOpWithKernel, paddle::framework::OpKernelTestProtoAndCheckerMaker); -REGISTER_OP_CPU_KERNEL( - test_op, - paddle::framework::TestKernel); +REGISTER_OP_CPU_KERNEL(test_op, + paddle::framework::TestKernel); REGISTER_OP_CUDA_KERNEL( test_op, paddle::framework::TestKernel); diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 90639255c3aab..4bf81b46b3456 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -44,8 +44,8 @@ void CastDataLayout::apply() { auto place = ctx_->GetPlace(); if (platform::is_cpu_place(place)) { - phi::funcs::Transpose trans4; - auto* context = static_cast(ctx_); + phi::funcs::Transpose trans4; + auto* context = static_cast(ctx_); trans4(*context, in_, out_, axis_); } else { PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 2576df3483412..9333e246c68bc 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -94,8 +94,8 @@ struct CastDataType { auto* out_begin = out_->mutable_data(in_.place()); if (platform::is_cpu_place(in_.place())) { - platform::Transform trans; - auto* context = static_cast(ctx_); + platform::Transform trans; + auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 9c666d00ab9d1..26ad71bafe6ff 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -117,7 +117,7 @@ struct TestBroadcastOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CPUPlace(); 
place_list_.push_back(p); - ctxs_.emplace_back(new p::CPUDeviceContext(p)); + ctxs_.emplace_back(new phi::CPUContext(p)); } #if defined(PADDLE_WITH_XPU_BKCL) bkcl_ctxs_.reset(nullptr); diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index ea63595cb2cfc..9cc1929e19ae8 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -69,7 +69,7 @@ struct TestGatherOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CPUPlace(); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CPUDeviceContext(p)); + ctxs_.emplace_back(new phi::CPUContext(p)); } } } diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index cce26f1e0dca1..767f7b1e48b43 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -316,7 +316,7 @@ template <> template <> template -void TensorCheckerVisitor::apply( +void TensorCheckerVisitor::apply( typename std::enable_if< std::is_floating_point::value || std::is_same>::value || @@ -329,11 +329,11 @@ void TensorCheckerVisitor::apply( } template <> -void tensor_check(const std::string& op_type, - const std::string& var_name, - const framework::Tensor& tensor, - const platform::Place& place) { - TensorCheckerVisitor vistor( +void tensor_check(const std::string& op_type, + const std::string& var_name, + const framework::Tensor& tensor, + const platform::Place& place) { + TensorCheckerVisitor vistor( op_type, var_name, tensor, place); VisitDataType(framework::TransToProtoVarType(tensor.dtype()), vistor); } @@ -439,7 +439,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, #endif return; } - tensor_check(op_type, var_name, *tensor, place); + tensor_check(op_type, var_name, *tensor, place); } void CheckVarHasNanOrInf(const std::string& op_type, diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 2a7ac790e8049..0d957bf81306f 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -81,7 +81,7 @@ struct TestReduceOpHandle { for (int i = 0; i < count; ++i) { auto p = p::CPUPlace(); gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CPUDeviceContext(p)); + ctxs_.emplace_back(new phi::CPUContext(p)); } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) nccl_ctxs_.reset(nullptr); diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index c0b4ac864cabc..33d293faad129 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -144,7 +144,7 @@ LoDTensor LodExpand(const LoDTensor& source, auto slice = tensor.Slice(elem, elem + 1); TensorCopy(source.Slice(ins, ins + 1), platform::CPUPlace(), - platform::CPUDeviceContext(), + phi::CPUContext(), &slice); } } diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index a95bcdf944943..fa0528d488297 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -232,9 +232,8 @@ class OpKernelTest : public paddle::framework::OpKernel { REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel, paddle::framework::OpWithKernelTest, paddle::framework::OpKernelTestMaker); -REGISTER_OP_CPU_KERNEL( - op_with_kernel, - paddle::framework::OpKernelTest); 
+REGISTER_OP_CPU_KERNEL(op_with_kernel, + paddle::framework::OpKernelTest); REGISTER_OP_CUDA_KERNEL( op_with_kernel, @@ -264,10 +263,9 @@ TEST(OperatorRegistrar, CUDA) { } static int op_test_value = 0; - -using paddle::platform::CPUDeviceContext; using paddle::platform::CUDADeviceContext; using paddle::platform::DeviceContext; +using phi::CPUContext; namespace paddle { namespace framework { @@ -295,8 +293,7 @@ class OpMultiKernelTest : public paddle::framework::OpKernel { }; template -class OpMultiKernelTest - : public paddle::framework::OpKernel { +class OpMultiKernelTest : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { ++op_test_value; @@ -319,7 +316,7 @@ class OpMultiKernelTest2 : public paddle::framework::OpKernel { }; template -class OpMultiKernelTest2 +class OpMultiKernelTest2 : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const { @@ -342,16 +339,14 @@ class OpMultiKernelTest2 REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, paddle::framework::OpWithMultiKernelTest, paddle::framework::OpKernelTestMaker); -REGISTER_OP_KERNEL( - op_with_multi_kernel, - CPU, - paddle::platform::CPUPlace, - paddle::framework::OpMultiKernelTest); -REGISTER_OP_KERNEL( - op_with_multi_kernel, - MKLDNN, - paddle::platform::CPUPlace, - paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL(op_with_multi_kernel, + CPU, + paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL(op_with_multi_kernel, + MKLDNN, + paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); REGISTER_OP_KERNEL( op_with_multi_kernel, CUDA, diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index d4dfd165259a2..ba7a5956ae0fd 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -420,16 +420,13 @@ REGISTER_OP_WITHOUT_GRADIENT( REGISTER_OP_CPU_KERNEL( indicate_lod_tensor_data_type_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); REGISTER_OP_CPU_KERNEL( indicate_selected_rows_data_type_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); REGISTER_OP_CPU_KERNEL( indicate_other_data_type_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); TEST(IndicateVarDataTypeTest, lodtensor) { paddle::framework::InitDevices(); @@ -599,16 +596,14 @@ REGISTER_OP_WITHOUT_GRADIENT(get_lod_level_test, paddle::framework::GetSetLoDLevelTestMaker); REGISTER_OP_CPU_KERNEL( get_lod_level_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); REGISTER_OP_WITHOUT_GRADIENT(set_lod_level_test, paddle::framework::SetLoDLevelTest, paddle::framework::GetSetLoDLevelTestMaker); REGISTER_OP_CPU_KERNEL( set_lod_level_test, - paddle::framework::EmptyTestKernel); + paddle::framework::EmptyTestKernel); void SetGetLoDLevelTestMain(std::string op_type) { paddle::framework::InitDevices({}); diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 535672f2e1288..6c8e825157973 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -66,7 +66,7 @@ struct ConvertToPhiContext { }; template <> -struct ConvertToPhiContext { +struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; diff --git a/paddle/fluid/framework/selected_rows_utils_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc index 
db2c6c1f991b7..340acf53efa9d 100644 --- a/paddle/fluid/framework/selected_rows_utils_test.cc +++ b/paddle/fluid/framework/selected_rows_utils_test.cc @@ -53,7 +53,7 @@ TEST_F(SelectedRowsTester, complete_dims) { TEST_F(SelectedRowsTester, SerializeAndDeseralize) { phi::SelectedRows dst_tensor; - platform::CPUDeviceContext cpu_ctx(place_); + phi::CPUContext cpu_ctx(place_); std::ostringstream oss; SerializeToStream(oss, *selected_rows_, cpu_ctx); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 2fe2b87fcd4ae..dd80458b624c6 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1253,7 +1253,7 @@ void TensorFromStream(std::istream& is, is.seekg(seekg, is.cur); void* buf; - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || @@ -1336,7 +1336,7 @@ void TensorFromStream(std::istream& is, std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); tensor->Resize(phi::make_ddim(dims)); void* buf; - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 20fab1d20b0c0..74454a5a09b7a 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -24,7 +24,7 @@ namespace framework { TEST(TensorCopy, Tensor) { Tensor src_tensor; Tensor dst_tensor; - platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + phi::CPUContext cpu_ctx((platform::CPUPlace())); int* src_ptr = src_tensor.mutable_data(phi::make_ddim({3, 3}), platform::CPUPlace()); @@ -164,7 +164,7 @@ TEST(TensorFromVector, Tensor) { // Copy to CPU Tensor cpu_tensor.Resize(phi::make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); - paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); + phi::CPUContext cpu_ctx(*cpu_place); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); // Copy to GPUTensor @@ -255,20 +255,23 @@ TEST(TensorToVector, Tensor) { #endif } -TEST(TensorToVector, Tensor_bool){{paddle::framework::Tensor src; -bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); -for (int i = 0; i < 3 * 3; ++i) { - src_ptr[i] = static_cast(i % 2); -} +TEST(TensorToVector, Tensor_bool) { +{ + paddle::framework::Tensor src; + bool* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } -paddle::platform::CPUPlace place; -std::vector dst; -paddle::framework::TensorToVector(src, &dst); + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); -for (int i = 0; i < 3 * 3; ++i) { - EXPECT_EQ(src_ptr[i], dst[i]); + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } } -} // namespace framework + #ifdef PADDLE_WITH_CUDA { std::vector src_vec = { @@ -325,7 +328,7 @@ for (int i = 0; i < 3 * 3; ++i) { } } #endif -} // namespace paddle +} TEST(TensorFromDLPack, Tensor) { { @@ -334,7 +337,7 @@ TEST(TensorFromDLPack, Tensor) { cpu_tensor.Resize(phi::make_ddim({3, 3})); paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext cpu_ctx(cpu_place); + 
phi::CPUContext cpu_ctx(cpu_place); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); paddle::framework::DLPackTensor dlpack_tensor(cpu_tensor, 1); @@ -360,7 +363,7 @@ TEST(TensorFromDLPack, Tensor) { // Copy to CPU Tensor cpu_tensor.Resize(phi::make_ddim({3, 3})); paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext cpu_ctx(cpu_place); + phi::CPUContext cpu_ctx(cpu_place); paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); // Copy to GPUTensor @@ -502,7 +505,7 @@ TEST(Tensor, FromAndToStream) { { framework::Tensor dst_tensor; auto place = new platform::CPUPlace(); - platform::CPUDeviceContext cpu_ctx(*place); + phi::CPUContext cpu_ctx(*place); std::ostringstream oss; TensorToStream(oss, src_tensor, cpu_ctx); diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index dd263f0f8f2fe..b6c21bead4182 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -46,8 +46,8 @@ void GLOOParallelContext::Init() { int port = std::stoi(addr[1]); gloo_wrapper->SetHttpStore(host, port, "worker"); gloo_wrapper->Init(); - device_ = std::unique_ptr( - new platform::CPUDeviceContext(platform::CPUPlace())); + device_ = std::unique_ptr( + new phi::CPUContext(platform::CPUPlace())); device_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CPUPlace()) .get()); @@ -200,7 +200,7 @@ void GLOOParallelContext::Broadcast(framework::Variable *src, int ring_id) { paddle::platform::DeviceContext *GLOOParallelContext::GetDeviceContext( int ring_id) { - // return the CPUDeviceContext + // return the CPUContext return device_.get(); } diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index 85aacc0d3f77b..5290e3d1315a4 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -64,7 +64,7 @@ class GLOOParallelContext : public ParallelContext { void AllReduce(const phi::SelectedRows& src, phi::SelectedRows* dst); private: - std::unique_ptr device_; + std::unique_ptr device_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index ba60c834f79ae..4a8fc6a5d546c 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -85,9 +85,9 @@ class TensorAddFunctor : public boost::static_visitor<> { : numel_(numel), x_(x), y_(y) {} void operator()(const platform::CPUPlace& place) const { - platform::CPUDeviceContext* ctx = dynamic_cast( + phi::CPUContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); - auto blas = phi::funcs::GetBlas(*ctx); + auto blas = phi::funcs::GetBlas(*ctx); blas.AXPY(numel_, 1., x_, y_); } @@ -438,7 +438,7 @@ void TensorAdd(const VarType& src, VarType* dst) { place)); #endif } else if (platform::is_cpu_place(place)) { - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); } } @@ -455,7 +455,7 @@ void TensorAdd(const VarType& src, VarType* dst) { place)); #endif } else if (platform::is_cpu_place(place)) { - return TensorAddImpl( + return TensorAddImpl( src_tensor, dst_tensor, place); } } @@ -498,8 +498,8 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double); } else { #endif - PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, float); - 
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -550,8 +550,8 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double); } else { #endif - PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, float); - PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, double); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); + PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -613,8 +613,8 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double); } else { #endif - PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, float); - PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, double); + PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, float); + PADDLE_SELECTED_ROWS_ADD(phi::CPUContext, double); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index f06ed80a940f0..9dd61b6f5e3cd 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -53,12 +53,11 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { } framework::VisitDataTypeForHIP( dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + DivNRanksForAllReduce(tensor, nranks, context)); #else - framework::VisitDataType(dtype_, - DivNRanksForAllReduce( - tensor, nranks, context)); + framework::VisitDataType( + dtype_, + DivNRanksForAllReduce(tensor, nranks, context)); #endif VLOG(4) << "after div 2" << *tensor; } else if (platform::is_xpu_place(tensor->place())) { @@ -328,11 +327,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { - ConcatTensorsWithType( - static_cast(context), - dense_tensors_, - &dense_contents_, - dtype_); + ConcatTensorsWithType(static_cast(context), + dense_tensors_, + &dense_contents_, + dtype_); } else { PADDLE_THROW(platform::errors::Unimplemented( "Concat grad tensor not supported on place (%s)", place)); @@ -390,11 +388,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) { "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { - SplitTensorsWithType( - static_cast(context), - &dense_contents_, - &dense_tensors_, - dtype_); + SplitTensorsWithType(static_cast(context), + &dense_contents_, + &dense_tensors_, + dtype_); } else { PADDLE_THROW(platform::errors::Unimplemented( "Split grad tensor not supported on place (%s)", place)); diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 4ba17aa126dc6..e2108278b15c5 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -234,7 +234,7 @@ void LiteSubgraphPass::SetUpEngine( framework::Scope* scope, const std::vector& params) { std::ostringstream os; - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; for (const auto& param : params) { VLOG(3) << "Serialize param: " << 
param; PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 6b6651678f85e..44e36647646fe 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -365,7 +365,7 @@ void ConvertToMixedPrecision(const std::string& model_file, [](framework::Scope* scope, const std::vector& params) -> std::string { std::ostringstream os; - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; for (const auto& param : params) { VLOG(3) << "Serialize param: " << param; PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index dee83f70ba2a2..45b9d222c4c3e 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -81,7 +81,7 @@ void make_fake_model(std::string* model, std::string* param) { ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); #endif // Prepare variables. std::vector repetitive_params{"x", "y"}; diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc index a2fe32b75f3de..d770ef5478abb 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc @@ -62,7 +62,7 @@ void IOConverterTester(const platform::DeviceContext& ctx) { TEST(EngineIOConverterTester, DefaultCPU) { platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); IOConverterTester(ctx); } diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 8f443f6f165e5..4a7f6cfbf0b31 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1469,20 +1469,16 @@ namespace plat = paddle::platform; ops::ActivationOpGrad, \ ops::ActivationGradOpInplaceInferer); -#define REGISTER_ACTIVATION_CPU_KERNEL( \ - act_type, op_name, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CPU_KERNEL( \ - act_type##_grad, \ - ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>); +#define REGISTER_ACTIVATION_CPU_KERNEL( \ + act_type, op_name, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL( \ + act_type, \ + ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index f58a838460ec3..f4e7481bdd456 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -122,12 +122,11 @@ REGISTER_OPERATOR( ops::AddPositionEncodingGradOpMaker); REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad); -REGISTER_OP_CPU_KERNEL( - add_position_encoding, - ops::AddPositionEncodingKernel, - ops::AddPositionEncodingKernel); +REGISTER_OP_CPU_KERNEL(add_position_encoding, + ops::AddPositionEncodingKernel, + ops::AddPositionEncodingKernel); 
REGISTER_OP_CPU_KERNEL( add_position_encoding_grad, - ops::AddPositionEncodingGradKernel, - ops::AddPositionEncodingGradKernel); + ops::AddPositionEncodingGradKernel, + ops::AddPositionEncodingGradKernel); diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index a72fd850d89bc..8c6360bfd89cf 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -342,7 +342,7 @@ DECLARE_INPLACE_OP_INFERER(AffineChannelGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp, diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 9efa8d0f86385..1977a33fc197e 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -28,7 +28,7 @@ namespace operators { using Tensor = framework::Tensor; template -struct Linspace { +struct Linspace { void operator()(T start, T end, int count, @@ -282,14 +282,12 @@ REGISTER_OPERATOR(affine_grid, ops::AffineGridGradMaker); REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad); -REGISTER_OP_CPU_KERNEL( - affine_grid, - ops::AffineGridOpKernel, - ops::AffineGridOpKernel); -REGISTER_OP_CPU_KERNEL( - affine_grid_grad, - ops::AffineGridGradOpKernel, - ops::AffineGridGradOpKernel); +REGISTER_OP_CPU_KERNEL(affine_grid, + ops::AffineGridOpKernel, + ops::AffineGridOpKernel); +REGISTER_OP_CPU_KERNEL(affine_grid_grad, + ops::AffineGridGradOpKernel, + ops::AffineGridGradOpKernel); REGISTER_OP_VERSION(affine_grid) .AddCheckpoint( diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 16c712b4a2751..aa3cd5d4149c4 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -84,7 +84,7 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor, diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc index c27f0f159f51e..fc96dd52e54a2 100644 --- a/paddle/fluid/operators/amp/alloc_float_status_op.cc +++ b/paddle/fluid/operators/amp/alloc_float_status_op.cc @@ -65,7 +65,7 @@ class AllocFloatStatusKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( alloc_float_status, diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 20f986596063e..8fc582c19845c 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -95,7 +95,7 @@ template class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); const auto xs = ctx.MultiInput("X"); const auto* scale = ctx.Input("Scale"); auto outs = ctx.MultiOutput("Out"); @@ -106,11 +106,10 @@ class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel { *found_inf_data = false; 
framework::Tensor is_finite = - ctx.AllocateTmpTensor({1}, dev_ctx); + ctx.AllocateTmpTensor({1}, dev_ctx); bool* is_finite_data = is_finite.template data(); - auto& dev = *ctx.template device_context() - .eigen_device(); + auto& dev = *ctx.template device_context().eigen_device(); T inverse_scale = Inverse(*scale_data); for (size_t i = 0; i < xs.size(); ++i) { diff --git a/paddle/fluid/operators/amp/clear_float_status_op.cc b/paddle/fluid/operators/amp/clear_float_status_op.cc index beef807620592..7bfc2d34d296e 100644 --- a/paddle/fluid/operators/amp/clear_float_status_op.cc +++ b/paddle/fluid/operators/amp/clear_float_status_op.cc @@ -68,7 +68,7 @@ class ClearFloatStatusKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( clear_float_status, diff --git a/paddle/fluid/operators/amp/get_float_status_op.cc b/paddle/fluid/operators/amp/get_float_status_op.cc index add662d8258eb..88a2affbcaaba 100644 --- a/paddle/fluid/operators/amp/get_float_status_op.cc +++ b/paddle/fluid/operators/amp/get_float_status_op.cc @@ -67,7 +67,7 @@ class GetFloatStatusKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( get_float_status, diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc index 346e981b3a99b..3bae775d30817 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -169,9 +169,9 @@ decr_every_n_nan_or_inf steps and each step some gradients are infinite. 
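The CheckFiniteAndUnscale hunk above only swaps the type argument of the device-context accessor that kernels use to obtain a concretely typed context. A minimal standalone analog of that accessor pattern (ExecutionContext, CpuContext and UnscaleInPlace are hypothetical simplifications, not Paddle's classes):

#include <cstdio>
#include <vector>

struct CpuContext {
  int num_threads() const { return 1; }  // stand-in for eigen_device() etc.
};

// Tiny execution context that owns the device context and hands out a typed
// reference on request, like ctx.template device_context<Ctx>() above.
class ExecutionContext {
 public:
  template <typename DeviceContext>
  const DeviceContext& device_context() const;

 private:
  CpuContext cpu_ctx_;
};

template <>
const CpuContext& ExecutionContext::device_context<CpuContext>() const {
  return cpu_ctx_;
}

template <typename T>
void UnscaleInPlace(const ExecutionContext& ctx, std::vector<T>* xs, T scale) {
  const CpuContext& dev_ctx = ctx.device_context<CpuContext>();
  (void)dev_ctx;  // a real kernel would use its eigen device here
  const T inv_scale = T(1) / scale;
  for (auto& v : *xs) v *= inv_scale;
}

int main() {
  ExecutionContext ctx;
  std::vector<float> grads{2.f, 4.f, 8.f};
  UnscaleInPlace(ctx, &grads, 2.f);
  std::printf("%.1f %.1f %.1f\n", grads[0], grads[1], grads[2]);  // 1.0 2.0 4.0
}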
}; template -class UpdateLossScalingFunctor { +class UpdateLossScalingFunctor { public: - void operator()(const platform::CPUDeviceContext& ctx, + void operator()(const phi::CPUContext& ctx, const bool* found_inf_data, const T* pre_loss_scaling_data, const int* good_in_data, @@ -203,9 +203,9 @@ class UpdateLossScalingFunctor { }; template -class LazyZeros { +class LazyZeros { public: - void operator()(const platform::CPUDeviceContext& dev_ctx, + void operator()(const phi::CPUContext& dev_ctx, const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { @@ -225,7 +225,7 @@ class LazyZeros { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( update_loss_scaling, diff --git a/paddle/fluid/operators/angle_op.cc b/paddle/fluid/operators/angle_op.cc index ae483f39e7f94..f925c7fa74759 100644 --- a/paddle/fluid/operators/angle_op.cc +++ b/paddle/fluid/operators/angle_op.cc @@ -116,20 +116,16 @@ REGISTER_OPERATOR(angle, REGISTER_OP_CPU_KERNEL( angle, - ops::AngleKernel, - ops::AngleKernel, - ops::AngleKernel>, - ops::AngleKernel>); + ops::AngleKernel, + ops::AngleKernel, + ops::AngleKernel>, + ops::AngleKernel>); REGISTER_OPERATOR(angle_grad, ops::AngleGradOp); REGISTER_OP_CPU_KERNEL( angle_grad, - ops::AngleGradKernel, - ops::AngleGradKernel, - ops::AngleGradKernel>, - ops::AngleGradKernel>); + ops::AngleGradKernel, + ops::AngleGradKernel, + ops::AngleGradKernel>, + ops::AngleGradKernel>); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index ef8ab38d2f35e..5b23ff604759a 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -51,7 +51,7 @@ struct ArrayToLoDFunctor : public boost::static_visitor { void operator()(Place place) const { auto &pool = platform::DeviceContextPool::Instance(); if (std::is_same::value) { - Apply(static_cast(pool.Get(place))); + Apply(static_cast(pool.Get(place))); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) Apply(static_cast(pool.Get(place))); diff --git a/paddle/fluid/operators/assign_op_test.cc b/paddle/fluid/operators/assign_op_test.cc index fd6a793ec4732..0b6245f17d38d 100644 --- a/paddle/fluid/operators/assign_op_test.cc +++ b/paddle/fluid/operators/assign_op_test.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ TEST(AssignOp, AssignLoDTensor) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); + phi::CPUContext ctx(cpu_place); paddle::framework::Variable output; paddle::operators::AssignFunctor assign_functor(&output, ctx); @@ -47,7 +47,7 @@ TEST(AssignOp, AssignLoDTensor) { TEST(AssignOp, AssignLoDTensorArray) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); + phi::CPUContext ctx(cpu_place); paddle::framework::Variable output; paddle::operators::AssignFunctor assign_functor(&output, ctx); @@ -78,7 +78,7 @@ TEST(AssignOp, AssignLoDTensorArray) { TEST(AssignOp, AssignSelectedRows) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); + phi::CPUContext ctx(cpu_place); paddle::framework::Variable output; paddle::operators::AssignFunctor assign_functor(&output, ctx); diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index e0d6e38e73fec..60e5912c4418d 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -337,7 +337,7 @@ template class AttentionLSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; auto* x = ctx.Input("X"); auto* h0 = ctx.Input("H0"); @@ -416,10 +416,10 @@ class AttentionLSTMKernel : public framework::OpKernel { T* lstm_x_data = lstm_x->mutable_data(ctx.GetPlace()); T* lstm_out_data = lstm_out->mutable_data(ctx.GetPlace()); - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); phi::funcs::FCFunctor fc; fc(dev_ctx, total_T, diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index bb1973f96aaea..856a703fd2b06 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -18,11 +18,10 @@ namespace paddle { namespace operators { template <> -void GetAccumulators( - const framework::ExecutionContext& ctx, - int64_t* num_updates, - int64_t* num_accumulates, - int64_t* old_num_accumulates) { +void GetAccumulators(const framework::ExecutionContext& ctx, + int64_t* num_updates, + int64_t* num_accumulates, + int64_t* old_num_accumulates) { auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); auto* in_num_accumulates = ctx.Input("in_num_accumulates"); auto* in_num_updates = ctx.Input("in_num_updates"); @@ -33,11 +32,10 @@ void GetAccumulators( } template <> -void SetAccumulators( - const framework::ExecutionContext& ctx, - int64_t num_updates, - int64_t num_accumulates, - int64_t old_num_accumulates) { +void SetAccumulators(const framework::ExecutionContext& ctx, + int64_t num_updates, + int64_t num_accumulates, + int64_t old_num_accumulates) { auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); auto* out_num_accumulates = ctx.Output("out_num_accumulates"); auto* out_num_updates = ctx.Output("out_num_updates"); @@ -217,7 +215,6 @@ REGISTER_OPERATOR( ops::AverageAccumulatesOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - average_accumulates, - ops::AverageAccumulatesKernel, - ops::AverageAccumulatesKernel); 
+REGISTER_OP_CPU_KERNEL(average_accumulates, + ops::AverageAccumulatesKernel, + ops::AverageAccumulatesKernel); diff --git a/paddle/fluid/operators/batch_fc_op.cc b/paddle/fluid/operators/batch_fc_op.cc index d8c11c04287c2..38504e3ecdf18 100644 --- a/paddle/fluid/operators/batch_fc_op.cc +++ b/paddle/fluid/operators/batch_fc_op.cc @@ -166,7 +166,6 @@ REGISTER_OPERATOR(batch_fc_grad, ops::BatchFCGradOp, ops::BatchFCGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - batch_fc, - ops::BatchFCKernel, - ops::BatchFCKernel); +REGISTER_OP_CPU_KERNEL(batch_fc, + ops::BatchFCKernel, + ops::BatchFCKernel); diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h index f30b4e2379ef7..2800ef3907407 100644 --- a/paddle/fluid/operators/beam_search_decode_op.h +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -141,7 +141,7 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( auto cpu_place = std::unique_ptr( new paddle::platform::CPUPlace()); - paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); + phi::CPUContext cpu_ctx(*cpu_place); framework::LoD lod; lod.push_back(source_level_lod); diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 3a2526fd52063..49ad3d166d908 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -143,9 +143,8 @@ REGISTER_OPERATOR(beam_search, ops::BeamSearchOp, ops::BeamSearchOpMaker, ops::BeamSearchInferVarType); -REGISTER_OP_CPU_KERNEL( - beam_search, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel); +REGISTER_OP_CPU_KERNEL(beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/bmm_op.cc b/paddle/fluid/operators/bmm_op.cc index a24d727e82727..8cacc3c4f2277 100644 --- a/paddle/fluid/operators/bmm_op.cc +++ b/paddle/fluid/operators/bmm_op.cc @@ -172,11 +172,9 @@ REGISTER_OPERATOR(bmm, ops::BmmOpGradMaker, ops::BmmOpGradMaker); REGISTER_OPERATOR(bmm_grad, ops::BmmOpGrad); -REGISTER_OP_CPU_KERNEL( - bmm, - ops::BmmKernel, - ops::BmmKernel); -REGISTER_OP_CPU_KERNEL( - bmm_grad, - ops::BmmGradKernel, - ops::BmmGradKernel); +REGISTER_OP_CPU_KERNEL(bmm, + ops::BmmKernel, + ops::BmmKernel); +REGISTER_OP_CPU_KERNEL(bmm_grad, + ops::BmmGradKernel, + ops::BmmGradKernel); diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 7362a2e6530c3..20ea0b187f64e 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -176,7 +176,7 @@ class BprLossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPUCtx = paddle::platform::CPUDeviceContext; +using CPUCtx = phi::CPUContext; REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 4245c336e5dcc..b3903da7c3f2a 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -141,7 +141,7 @@ class CastOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; // cast use phi kernel, so no need to REGISTER_OP_CPU_KERNEL here. 
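Several hunks above retarget explicit and partial specializations (UpdateLossScalingFunctor, LazyZeros, GetAccumulators/SetAccumulators) from the legacy CPU context to phi::CPUContext. The underlying C++ mechanism is specialization on the device-context type; a standalone sketch with hypothetical CpuContext/GpuContext tags and values:

#include <cstdio>
#include <vector>

struct CpuContext {};
struct GpuContext {};

// Primary template: declared only, defined per device via specialization.
template <typename DeviceContext, typename T>
struct FillFunctor;

// CPU specialization: a plain loop. Retargeting a code base to a new CPU
// context type means re-pointing specializations like this one.
template <typename T>
struct FillFunctor<CpuContext, T> {
  void operator()(const CpuContext&, std::vector<T>* out, T value) const {
    for (auto& v : *out) v = value;
  }
};

// Free function template with an explicit specialization for the CPU
// context, mirroring the GetAccumulators<phi::CPUContext> edit above.
template <typename DeviceContext>
int GetAccumulatorCount(const DeviceContext& ctx);

template <>
int GetAccumulatorCount<CpuContext>(const CpuContext&) {
  return 3;  // hypothetical value; the real op reads it from input tensors
}

int main() {
  CpuContext ctx;
  std::vector<float> buf(4);
  FillFunctor<CpuContext, float>()(ctx, &buf, 1.5f);
  std::printf("%.1f x%zu, accumulators=%d\n", buf[0], buf.size(),
              GetAccumulatorCount(ctx));
}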
REGISTER_OPERATOR(cast, diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc index a3a67177e2b47..15cc71565091c 100644 --- a/paddle/fluid/operators/center_loss_op.cc +++ b/paddle/fluid/operators/center_loss_op.cc @@ -146,7 +146,7 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(CenterLossGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; -using CPUCtx = paddle::platform::CPUDeviceContext; +using CPUCtx = phi::CPUContext; REGISTER_OPERATOR(center_loss, ops::CenterLossOp, diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc index 644de7e191faa..b7600cbb4af41 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op.cc @@ -113,13 +113,11 @@ It accomplishes the execution of the instruction according to the following step } // namespace paddle::operators namespace ops = paddle::operators; -using CPUDeviceContext = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR( cinn_instruction_run, ops::CinnInstructionRunOp, ops::CinnInstructionRunOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - cinn_instruction_run, - ops::CinnInstructionRunOpKernel); +REGISTER_OP_CPU_KERNEL(cinn_instruction_run, + ops::CinnInstructionRunOpKernel); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index 4e0ed2cfb199c..cd0a31dc0cddd 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -189,6 +189,5 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); /* see [Why use single type kernel] */ -REGISTER_OP_CPU_KERNEL( - cinn_launch, - ops::CinnLaunchOpKernel); +REGISTER_OP_CPU_KERNEL(cinn_launch, + ops::CinnLaunchOpKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc index 9ce6f7bebc837..cfb56a4b2a6b1 100644 --- a/paddle/fluid/operators/clip_by_norm_op.cc +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -19,6 +19,5 @@ REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, ops::ClipByNormOpMaker); -REGISTER_OP_CPU_KERNEL( - clip_by_norm, - ops::ClipByNormKernel); +REGISTER_OP_CPU_KERNEL(clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 5c0f5c39a34e5..561d2696fef85 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -511,11 +511,10 @@ REGISTER_OPERATOR(coalesce_tensor, paddle::operators::CoalesceTensorOpMaker); namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CPU_KERNEL( - coalesce_tensor, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel); +REGISTER_OP_CPU_KERNEL(coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( @@ -550,20 +549,18 @@ REGISTER_OP_XPU_KERNEL( #if defined(PADDLE_WITH_ASCEND_CL) REGISTER_OP_NPU_KERNEL( coalesce_tensor, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel); + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + 
ops::CoalesceTensorOpKernel); #endif #if defined(PADDLE_WITH_MLU) REGISTER_OP_MLU_KERNEL( coalesce_tensor, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel, - ops::CoalesceTensorOpKernel); + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); #endif REGISTER_OP_VERSION(coalesce_tensor) diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 6c365292f54fd..b3351dc82b7e7 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -73,10 +73,9 @@ REGISTER_OP_WITHOUT_GRADIENT(allreduce, ops::AllReduceOp, ops::AllReduceOpMaker); -REGISTER_OP_CPU_KERNEL( - allreduce, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel); +REGISTER_OP_CPU_KERNEL(allreduce, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel, + ops::AllReduceOpKernel); diff --git a/paddle/fluid/operators/complex_op.cc b/paddle/fluid/operators/complex_op.cc index ebd0b2334529d..d6d93fe958118 100644 --- a/paddle/fluid/operators/complex_op.cc +++ b/paddle/fluid/operators/complex_op.cc @@ -143,12 +143,10 @@ REGISTER_OPERATOR(complex, REGISTER_OPERATOR(complex_grad, ops::ComplexGradOp); -REGISTER_OP_CPU_KERNEL( - complex, - ops::ComplexKernel, - ops::ComplexKernel); - -REGISTER_OP_CPU_KERNEL( - complex_grad, - ops::ComplexGradKernel, - ops::ComplexGradKernel); +REGISTER_OP_CPU_KERNEL(complex, + ops::ComplexKernel, + ops::ComplexKernel); + +REGISTER_OP_CPU_KERNEL(complex_grad, + ops::ComplexGradKernel, + ops::ComplexGradKernel); diff --git a/paddle/fluid/operators/complex_view_op.cc b/paddle/fluid/operators/complex_view_op.cc index 344b2a1c48ad4..6bdd2b48c4503 100644 --- a/paddle/fluid/operators/complex_view_op.cc +++ b/paddle/fluid/operators/complex_view_op.cc @@ -161,12 +161,10 @@ REGISTER_OPERATOR(as_real, ops::AsRealGradMaker, ops::AsRealGradMaker); -REGISTER_OP_CPU_KERNEL( - as_complex, - ops::AsComplexKernel, - ops::AsComplexKernel); - -REGISTER_OP_CPU_KERNEL( - as_real, - ops::AsRealKernel, - ops::AsRealKernel); +REGISTER_OP_CPU_KERNEL(as_complex, + ops::AsComplexKernel, + ops::AsComplexKernel); + +REGISTER_OP_CPU_KERNEL(as_real, + ops::AsRealKernel, + ops::AsRealKernel); diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc index e6ecf6675cba9..e3228104de38b 100644 --- a/paddle/fluid/operators/cos_sim_op.cc +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -249,8 +249,6 @@ REGISTER_OPERATOR(cos_sim, ops::CosSimGradOpMaker, ops::CosSimGradOpMaker); REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad); -REGISTER_OP_CPU_KERNEL( - cos_sim, ops::CosSimKernel); -REGISTER_OP_CPU_KERNEL( - cos_sim_grad, - ops::CosSimGradKernel); +REGISTER_OP_CPU_KERNEL(cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL(cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index 3bead0127b823..ee3ff671ede2d 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -215,7 +215,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, ops::CRFDecodingOpMaker); -REGISTER_OP_CPU_KERNEL( - crf_decoding, - ops::CRFDecodingOpKernel, - ops::CRFDecodingOpKernel); +REGISTER_OP_CPU_KERNEL(crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); 
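The allreduce, complex and crf_decoding registrations above list one kernel instantiation per element type because the framework selects the instantiation at run time from the tensor's dtype. A toy standalone sketch of that dispatch idea (the string-keyed registry and names here are hypothetical, not Paddle's OpKernel registry):

#include <cstdio>
#include <functional>
#include <map>
#include <string>
#include <vector>

struct CpuContext {};

template <typename DeviceContext, typename T>
void SumKernel(const DeviceContext&, const std::vector<T>& in, T* out) {
  *out = T(0);
  for (const T& v : in) *out += v;
}

// Toy registry: dtype name -> type-erased launcher. Registering a kernel
// for float, double, int, ... adds one entry per dtype, which is what the
// multi-entry registrations above amount to.
std::map<std::string, std::function<void(const CpuContext&)>> g_sum_registry;

int main() {
  std::vector<float> xf{1.f, 2.f, 3.f};
  std::vector<int> xi{1, 2, 3};

  g_sum_registry["float"] = [&](const CpuContext& ctx) {
    float out;
    SumKernel<CpuContext, float>(ctx, xf, &out);
    std::printf("float sum = %.1f\n", out);
  };
  g_sum_registry["int"] = [&](const CpuContext& ctx) {
    int out;
    SumKernel<CpuContext, int>(ctx, xi, &out);
    std::printf("int sum = %d\n", out);
  };

  CpuContext ctx;
  g_sum_registry["float"](ctx);  // dispatch by the tensor's runtime dtype
  g_sum_registry["int"](ctx);
}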
diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index d1358ca2f44e8..bdc1f61fbe0eb 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -223,14 +223,12 @@ REGISTER_OPERATOR(crop, ops::CropGradOpMaker, ops::GropNoNeedBufferVarInferer); REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); -REGISTER_OP_CPU_KERNEL( - crop, - ops::CropKernel, - ops::CropKernel); -REGISTER_OP_CPU_KERNEL( - crop_grad, - ops::CropGradKernel, - ops::CropGradKernel); +REGISTER_OP_CPU_KERNEL(crop, + ops::CropKernel, + ops::CropKernel); +REGISTER_OP_CPU_KERNEL(crop_grad, + ops::CropGradKernel, + ops::CropGradKernel); REGISTER_OP_CUDA_KERNEL( crop, diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 9422de6093441..f72175d4d5338 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -320,18 +320,16 @@ REGISTER_OPERATOR(crop_tensor, ops::CropTensorGradOpMaker, ops::CropTensorGradOpMaker); REGISTER_OPERATOR(crop_tensor_grad, ops::CropTensorOpGrad); -REGISTER_OP_CPU_KERNEL( - crop_tensor, - ops::CropTensorKernel, - ops::CropTensorKernel, - ops::CropTensorKernel, - ops::CropTensorKernel); -REGISTER_OP_CPU_KERNEL( - crop_tensor_grad, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel); +REGISTER_OP_CPU_KERNEL(crop_tensor, + ops::CropTensorKernel, + ops::CropTensorKernel, + ops::CropTensorKernel, + ops::CropTensorKernel); +REGISTER_OP_CPU_KERNEL(crop_tensor_grad, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel); REGISTER_OP_CUDA_KERNEL( crop_tensor, diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 5c0c2794bd652..0d98f5b75e4fb 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -421,7 +421,7 @@ class CrossEntropyGradOpMaker2 : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPUCtx = paddle::platform::CPUDeviceContext; +using CPUCtx = phi::CPUContext; REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase, diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc index 6d3da99820517..dbab71e1619ec 100644 --- a/paddle/fluid/operators/ctc_align_op.cc +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -129,7 +129,6 @@ REGISTER_OPERATOR( ops::CTCAlignOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - ctc_align, - ops::CTCAlignKernel, - ops::CTCAlignKernel); +REGISTER_OP_CPU_KERNEL(ctc_align, + ops::CTCAlignKernel, + ops::CTCAlignKernel); diff --git a/paddle/fluid/operators/cum_op.cc b/paddle/fluid/operators/cum_op.cc index 169f1919a7539..b42f26342ab97 100644 --- a/paddle/fluid/operators/cum_op.cc +++ b/paddle/fluid/operators/cum_op.cc @@ -145,7 +145,7 @@ class LogcumsumexpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, PD_INFER_META(phi::CumInferMeta)); diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index e0997635cb42c..6685e54e43b60 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -287,8 
+287,7 @@ The required data format for this layer is one of the following: }; template -class DataNormKernel - : public framework::OpKernel { +class DataNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { // const bool is_test = ctx.Attr("is_test"); @@ -533,8 +532,7 @@ class DataNormGradOp : public framework::OperatorWithKernel { }; template -class DataNormGradKernel - : public framework::OpKernel { +class DataNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); @@ -788,14 +786,12 @@ REGISTER_OPERATOR(data_norm, ops::DataNormGradMaker); REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp); -REGISTER_OP_CPU_KERNEL( - data_norm, - ops::DataNormKernel, - ops::DataNormKernel); -REGISTER_OP_CPU_KERNEL( - data_norm_grad, - ops::DataNormGradKernel, - ops::DataNormGradKernel); +REGISTER_OP_CPU_KERNEL(data_norm, + ops::DataNormKernel, + ops::DataNormKernel); +REGISTER_OP_CPU_KERNEL(data_norm_grad, + ops::DataNormGradKernel, + ops::DataNormGradKernel); REGISTER_OP_VERSION(data_norm).AddCheckpoint( R"ROC( upgrad data_norm op by adding scale_w to support scale and shift.)ROC", diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc index 45ed3642f1066..f83a4c04a8162 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cc +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -349,7 +349,7 @@ class DeformablePSROIPoolGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( deformable_psroi_pooling, ops::DeformablePSROIPoolOp, diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc index 1c784d9891b44..64807329a4043 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cc +++ b/paddle/fluid/operators/dequantize_abs_max_op.cc @@ -33,8 +33,8 @@ namespace paddle { namespace operators { template -struct DequantizeFunctor { - void operator()(const platform::CPUDeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, float max_range, @@ -49,8 +49,8 @@ struct DequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; class DequantizeMaxAbsOp : public framework::OperatorWithKernel { public: @@ -102,7 +102,7 @@ This calculation is an opposite operation of QuantizeMaxAbsOp: } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( dequantize_abs_max, diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc index 28d218ed3a85a..c80c050b14afd 100644 --- a/paddle/fluid/operators/dequantize_log_op.cc +++ b/paddle/fluid/operators/dequantize_log_op.cc @@ -32,8 +32,8 @@ namespace paddle { namespace operators { template -struct DequantizeFunctor { - void operator()(const platform::CPUDeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* dict, framework::Tensor* out) { @@ -51,7 +51,7 @@ struct DequantizeFunctor { } 
}; -template struct DequantizeFunctor; +template struct DequantizeFunctor; class DequantizeLogOp : public framework::OperatorWithKernel { public: @@ -108,7 +108,7 @@ This calculation is an opposite operation of QuantizeLogOp: } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( dequantize_log, diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index 86b18f8920b87..ef824d2d8cdcd 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -200,7 +200,7 @@ class BipartiteMatchKernel : public framework::OpKernel { auto* match_indices = context.Output("ColToRowMatchIndices"); auto* match_dist = context.Output("ColToRowMatchDist"); - auto& dev_ctx = context.device_context(); + auto& dev_ctx = context.device_context(); auto col = dist_mat->dims()[1]; @@ -216,9 +216,9 @@ class BipartiteMatchKernel : public framework::OpKernel { match_indices->mutable_data({n, col}, context.GetPlace()); match_dist->mutable_data({n, col}, context.GetPlace()); - phi::funcs::SetConstant iset; + phi::funcs::SetConstant iset; iset(dev_ctx, match_indices, static_cast(-1)); - phi::funcs::SetConstant tset; + phi::funcs::SetConstant tset; tset(dev_ctx, match_dist, static_cast(0)); int* indices = match_indices->data(); diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc index ade3ed5f4de26..cd17a8c9883df 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cc +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -104,7 +104,6 @@ REGISTER_OPERATOR( ops::BoxClipOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - box_clip, - ops::BoxClipKernel, - ops::BoxClipKernel); +REGISTER_OP_CPU_KERNEL(box_clip, + ops::BoxClipKernel, + ops::BoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h index e27dd30896852..5c816ee3eb5e2 100644 --- a/paddle/fluid/operators/detection/box_clip_op.h +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -29,8 +29,7 @@ class BoxClipKernel : public framework::OpKernel { auto* input_box = context.Input("Input"); auto* im_info = context.Input("ImInfo"); auto* output_box = context.Output("Output"); - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); output_box->mutable_data(context.GetPlace()); if (input_box->lod().size()) { PADDLE_ENFORCE_EQ(input_box->lod().size(), diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 87edd80143a55..64aa86315622f 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -251,7 +251,6 @@ REGISTER_OPERATOR( ops::BoxCoderOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - box_coder, - ops::BoxCoderKernel, - ops::BoxCoderKernel); +REGISTER_OP_CPU_KERNEL(box_coder, + ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc index 05a44dda32a54..d641a6fd41ef7 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc +++ 
b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc @@ -227,7 +227,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - box_decoder_and_assign, - ops::BoxDecoderAndAssignKernel, - ops::BoxDecoderAndAssignKernel); +REGISTER_OP_CPU_KERNEL(box_decoder_and_assign, + ops::BoxDecoderAndAssignKernel, + ops::BoxDecoderAndAssignKernel); diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index 93e9111f1ac61..5473a57902b87 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -122,7 +122,7 @@ class GenerateMaskLabelsOp : public framework::OperatorWithKernel { * to encode class specific mask targets. */ template -static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, +static inline void ExpandMaskTarget(const phi::CPUContext& ctx, const Tensor& masks, const Tensor& mask_class_labels, const int resolution, @@ -150,7 +150,7 @@ static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, } template -std::vector SampleMaskForOneImage(const platform::CPUDeviceContext& ctx, +std::vector SampleMaskForOneImage(const phi::CPUContext& ctx, const Tensor& im_info, const Tensor& gt_classes, const Tensor& is_crowd, @@ -391,7 +391,7 @@ class GenerateMaskLabelsKernel : public framework::OpKernel { std::vector lod0(1, 0); int64_t num_mask = 0; - auto& dev_ctx = ctx.device_context(); + auto& dev_ctx = ctx.device_context(); auto gt_classes_lod = gt_classes->lod().back(); auto is_crowd_lod = is_crowd->lod().back(); diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 749e88c0a9975..7376e0993a506 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -168,7 +168,7 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel { }; template -void Concat(const platform::CPUDeviceContext& context, +void Concat(const phi::CPUContext& context, const Tensor& in_tensor_a, const Tensor& in_tensor_b, Tensor* out_tensor) { @@ -176,24 +176,23 @@ void Concat(const platform::CPUDeviceContext& context, std::vector inputs; inputs.emplace_back(in_tensor_a); inputs.emplace_back(in_tensor_b); - math::ConcatFunctor concat_functor; + math::ConcatFunctor concat_functor; concat_functor(context, inputs, axis, out_tensor); } template -std::vector> SampleFgBgGt( - const platform::CPUDeviceContext& context, - Tensor* iou, - const Tensor& is_crowd, - const int batch_size_per_im, - const float fg_fraction, - const float fg_thresh, - const float bg_thresh_hi, - const float bg_thresh_lo, - std::minstd_rand engine, - const bool use_random, - const bool is_cascade_rcnn, - const Tensor& rpn_rois) { +std::vector> SampleFgBgGt(const phi::CPUContext& context, + Tensor* iou, + const Tensor& is_crowd, + const int batch_size_per_im, + const float fg_fraction, + const float fg_thresh, + const float bg_thresh_hi, + const float bg_thresh_lo, + std::minstd_rand engine, + const bool use_random, + const bool is_cascade_rcnn, + const Tensor& rpn_rois) { std::vector fg_inds; std::vector bg_inds; std::vector mapped_gt_inds; @@ -286,7 +285,7 @@ std::vector> SampleFgBgGt( } template -void GatherBoxesLabels(const platform::CPUDeviceContext& context, +void GatherBoxesLabels(const 
phi::CPUContext& context, const Tensor& boxes, const Tensor& max_overlap, const Tensor& gt_boxes, @@ -335,7 +334,7 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, template std::vector SampleRoisForOneImage( - const platform::CPUDeviceContext& context, + const phi::CPUContext& context, const Tensor& rpn_rois_in, const Tensor& gt_classes, const Tensor& is_crowd, @@ -372,7 +371,7 @@ std::vector SampleRoisForOneImage( Tensor roi_filter; // Tensor box_filter; if (keep.numel() == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); set_zero(context, &roi_filter, static_cast(0)); } else { @@ -597,7 +596,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { std::vector lod0(1, 0); int64_t num_rois = 0; - auto& dev_ctx = context.device_context(); + auto& dev_ctx = context.device_context(); auto rpn_rois_lod = rpn_rois->lod().back(); auto gt_classes_lod = gt_classes->lod().back(); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index ba213f10852e7..29d7347f1ba75 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -98,8 +98,7 @@ class GenerateProposalsKernel : public framework::OpKernel { float min_size = context.Attr("min_size"); float eta = context.Attr("eta"); - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); auto &scores_dim = scores->dims(); int64_t num = scores_dim[0]; @@ -122,7 +121,7 @@ class GenerateProposalsKernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - phi::funcs::Transpose trans; + phi::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); @@ -181,7 +180,7 @@ class GenerateProposalsKernel : public framework::OpKernel { } std::pair ProposalForOneImage( - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, @@ -234,7 +233,7 @@ class GenerateProposalsKernel : public framework::OpKernel { FilterBoxes(ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); Tensor scores_filter; diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 257716b635724..450154bec4e17 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -99,8 +99,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { float eta = context.Attr("eta"); bool pixel_offset = context.Attr("pixel_offset"); - auto &dev_ctx = - context.template device_context(); + auto &dev_ctx = context.template device_context(); auto &scores_dim = scores->dims(); int64_t num = scores_dim[0]; @@ -123,7 +122,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - phi::funcs::Transpose trans; + phi::funcs::Transpose trans; 
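The proposal-generation hunks above also retarget small math helpers such as phi::funcs::SetConstant and phi::funcs::Transpose; these take the device context as their first argument so the same call site can be compiled against either device. A simplified standalone 2-D analog (CpuContext and Transpose2D are hypothetical, and the real Transpose handles N-D tensors with an axis permutation):

#include <cstdio>
#include <vector>

struct CpuContext {};

// Context-parameterized transpose functor: the context is threaded through
// as the first argument, like the helpers in the hunks above.
template <typename DeviceContext, typename T>
struct Transpose2D {
  void operator()(const DeviceContext&, const std::vector<T>& in,
                  int rows, int cols, std::vector<T>* out) const {
    out->assign(in.size(), T());
    for (int r = 0; r < rows; ++r)
      for (int c = 0; c < cols; ++c)
        (*out)[c * rows + r] = in[r * cols + c];
  }
};

int main() {
  CpuContext ctx;
  std::vector<int> m{1, 2, 3,
                     4, 5, 6};  // 2x3, row-major
  std::vector<int> t;
  Transpose2D<CpuContext, int>()(ctx, m, 2, 3, &t);
  for (int v : t) std::printf("%d ", v);  // 1 4 2 5 3 6
  std::printf("\n");
}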
std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); @@ -183,7 +182,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { } std::pair ProposalForOneImage( - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, const Tensor &im_shape_slice, const Tensor &anchors, const Tensor &variances, @@ -240,7 +239,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { ctx, &proposals, min_size, im_shape_slice, false, &keep, pixel_offset); // Handle the case when there is no keep index left if (keep.numel() == 0) { - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); Tensor scores_filter; diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc index 504090cfaf7fe..c31c630cd6ccd 100644 --- a/paddle/fluid/operators/detection/iou_similarity_op.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -113,7 +113,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - iou_similarity, - ops::IOUSimilarityKernel, - ops::IOUSimilarityKernel); +REGISTER_OP_CPU_KERNEL(iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc index 6b0608d386f2e..6fb48229517d3 100644 --- a/paddle/fluid/operators/detection/locality_aware_nms_op.cc +++ b/paddle/fluid/operators/detection/locality_aware_nms_op.cc @@ -356,7 +356,7 @@ class LocalityAwareNMSKernel : public framework::OpKernel { auto* outs = ctx.Output("Out"); auto& score_dims = scores_input->dims(); auto score_size = score_dims.size(); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); LoDTensor scores; LoDTensor boxes; diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index e2157b02f92d2..163da3cdd9727 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -403,7 +403,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - mine_hard_examples, - ops::MineHardExamplesKernel, - ops::MineHardExamplesKernel); +REGISTER_OP_CPU_KERNEL(mine_hard_examples, + ops::MineHardExamplesKernel, + ops::MineHardExamplesKernel); diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 0fb02832be066..68b4ab20150bb 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -219,7 +219,7 @@ class MultiClassNMSKernel : public framework::OpKernel { T nms_threshold = static_cast(ctx.Attr("nms_threshold")); T nms_eta = static_cast(ctx.Attr("nms_eta")); T score_threshold = static_cast(ctx.Attr("score_threshold")); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); int num_det = 0; @@ -361,7 +361,7 @@ class MultiClassNMSKernel : public framework::OpKernel { auto rois_num = ctx.Input("RoisNum"); auto score_dims = scores->dims(); auto score_size = score_dims.size(); - auto& dev_ctx = ctx.template device_context(); + auto& 
dev_ctx = ctx.template device_context(); std::vector>> all_indices; std::vector batch_starts = {0}; diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc index a2a9358ca0d85..915b174f174c5 100644 --- a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -507,7 +507,7 @@ class RetinanetDetectionOutputKernel : public framework::OpKernel { int64_t box_dim = box_dims[2]; int64_t out_dim = box_dim + 2; - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); std::vector>> all_nmsed_out; std::vector batch_starts = {0}; diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 4d7d9fec77dbe..8fbfe2ad8548c 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -112,12 +112,11 @@ void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) { } template -std::vector FilterStraddleAnchor( - const platform::CPUDeviceContext& context, - const Tensor* anchor, - const float rpn_straddle_thresh, - T im_height, - T im_width) { +std::vector FilterStraddleAnchor(const phi::CPUContext& context, + const Tensor* anchor, + const float rpn_straddle_thresh, + T im_height, + T im_width) { std::vector inds_inside; int anchor_num = anchor->dims()[0]; auto* anchor_data = anchor->data(); @@ -154,7 +153,7 @@ std::vector FilterStraddleAnchor( } template -Tensor FilterCrowdGt(const platform::CPUDeviceContext& context, +Tensor FilterCrowdGt(const phi::CPUContext& context, Tensor* gt_boxes, Tensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; @@ -300,7 +299,7 @@ void ScoreAssign(const T* anchor_by_gt_overlap_data, } template -std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, +std::vector SampleRpnFgBgGt(const phi::CPUContext& ctx, const Tensor& anchor_by_gt_overlap, const int rpn_batch_size_per_im, const float rpn_positive_overlap, @@ -437,7 +436,7 @@ class RpnTargetAssignKernel : public framework::OpKernel { tgt_bbox->mutable_data({max_num, 4}, place); tgt_lbl->mutable_data({max_num, 1}, place); bbox_inside_weight->mutable_data({max_num, 4}, place); - auto& dev_ctx = context.device_context(); + auto& dev_ctx = context.device_context(); std::random_device rnd; std::minstd_rand engine; @@ -857,11 +856,10 @@ class RetinanetTargetAssignOp : public framework::OperatorWithKernel { }; template -std::vector FilterCrowdGtBoxLabel( - const platform::CPUDeviceContext& context, - Tensor* gt_boxes, - Tensor* gt_labels, - Tensor* is_crowd) { +std::vector FilterCrowdGtBoxLabel(const phi::CPUContext& context, + Tensor* gt_boxes, + Tensor* gt_labels, + Tensor* is_crowd) { int gt_num = gt_boxes->dims()[0]; std::vector not_crowd_inds; auto* is_crowd_data = is_crowd->data(); @@ -893,7 +891,7 @@ std::vector FilterCrowdGtBoxLabel( } template -std::vector GetAllFgBgGt(const platform::CPUDeviceContext& ctx, +std::vector GetAllFgBgGt(const phi::CPUContext& ctx, const Tensor& anchor_by_gt_overlap, const Tensor& ncrowd_gt_labels, const float positive_overlap, @@ -1044,7 +1042,7 @@ class RetinanetTargetAssignKernel : public framework::OpKernel { tgt_lbl->mutable_data({max_num, 1}, place); bbox_inside_weight->mutable_data({max_num, 4}, place); fg_num->mutable_data({batch_num, 1}, place); - auto& dev_ctx = context.device_context(); + auto& dev_ctx = 
context.device_context(); std::random_device rnd; std::minstd_rand engine; diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc index 9bdc1b645bfe6..bc23c5105db94 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc @@ -266,12 +266,10 @@ REGISTER_OPERATOR(sigmoid_focal_loss, ops::SigmoidFocalLossGradOpMaker, ops::SigmoidFocalLossGradOpMaker); REGISTER_OPERATOR(sigmoid_focal_loss_grad, ops::SigmoidFocalLossGradOp); -REGISTER_OP_CPU_KERNEL( - sigmoid_focal_loss, - ops::SigmoidFocalLossKernel, - ops::SigmoidFocalLossKernel); +REGISTER_OP_CPU_KERNEL(sigmoid_focal_loss, + ops::SigmoidFocalLossKernel, + ops::SigmoidFocalLossKernel); REGISTER_OP_CPU_KERNEL( sigmoid_focal_loss_grad, - ops::SigmoidFocalLossGradKernel, - ops::SigmoidFocalLossGradKernel); + ops::SigmoidFocalLossGradKernel, + ops::SigmoidFocalLossGradKernel); diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc index a6c1db5f78d0e..99deee3f72aea 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -149,8 +149,8 @@ for i-th instance and each `id` of NegIndices in this instance: }; template -struct NegTargetAssignFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct NegTargetAssignFunctor { + void operator()(const phi::CPUContext& ctx, const int* neg_indices, const size_t* lod, const int N, @@ -172,10 +172,8 @@ struct NegTargetAssignFunctor { } }; -template struct NegTargetAssignFunctor; -template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; } // namespace operators } // namespace paddle @@ -187,7 +185,6 @@ REGISTER_OPERATOR( ops::TargetAssignOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - target_assign, - ops::TargetAssignKernel, - ops::TargetAssignKernel); +REGISTER_OP_CPU_KERNEL(target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/determinant_op.cc b/paddle/fluid/operators/determinant_op.cc index b3af0853095da..b4724eb3c83a3 100644 --- a/paddle/fluid/operators/determinant_op.cc +++ b/paddle/fluid/operators/determinant_op.cc @@ -179,12 +179,10 @@ REGISTER_OPERATOR(slogdeterminant, REGISTER_OPERATOR(slogdeterminant_grad, ops::SlogDeterminantGradOp) // reuse det grad op -REGISTER_OP_CPU_KERNEL( - slogdeterminant, - ops::SlogDeterminantKernel, - ops::SlogDeterminantKernel); - -REGISTER_OP_CPU_KERNEL( - slogdeterminant_grad, - ops::SlogDeterminantGradKernel, - ops::SlogDeterminantGradKernel); +REGISTER_OP_CPU_KERNEL(slogdeterminant, + ops::SlogDeterminantKernel, + ops::SlogDeterminantKernel); + +REGISTER_OP_CPU_KERNEL(slogdeterminant_grad, + ops::SlogDeterminantGradKernel, + ops::SlogDeterminantGradKernel); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc index 5d0cd4bbc3578..9949fefb1b18b 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.cc +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc @@ -66,6 +66,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dgc_clip_by_norm, ops::DGCClipByNormOp, ops::DGCClipByNormOpMaker); -REGISTER_OP_CPU_KERNEL( - dgc_clip_by_norm, - ops::DGCClipByNormKernel); +REGISTER_OP_CPU_KERNEL(dgc_clip_by_norm, + ops::DGCClipByNormKernel); diff --git 
a/paddle/fluid/operators/diag_embed_op.cc b/paddle/fluid/operators/diag_embed_op.cc index 0377e40e0a221..531d6f92d8830 100644 --- a/paddle/fluid/operators/diag_embed_op.cc +++ b/paddle/fluid/operators/diag_embed_op.cc @@ -138,9 +138,8 @@ REGISTER_OPERATOR( ops::DiagEmbedOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - diag_embed, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel, - ops::DiagEmbedKernel); +REGISTER_OP_CPU_KERNEL(diag_embed, + ops::DiagEmbedKernel, + ops::DiagEmbedKernel, + ops::DiagEmbedKernel, + ops::DiagEmbedKernel); diff --git a/paddle/fluid/operators/diag_op.cc b/paddle/fluid/operators/diag_op.cc index 9f12e4af47fdd..8ccc5ff3891b9 100644 --- a/paddle/fluid/operators/diag_op.cc +++ b/paddle/fluid/operators/diag_op.cc @@ -59,9 +59,8 @@ REGISTER_OPERATOR( ops::DiagOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - diag, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel, - ops::DiagKernel); +REGISTER_OP_CPU_KERNEL(diag, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel); diff --git a/paddle/fluid/operators/dirichlet_op.cc b/paddle/fluid/operators/dirichlet_op.cc index 81a3e63192eb5..ccbe3b62b73dd 100644 --- a/paddle/fluid/operators/dirichlet_op.cc +++ b/paddle/fluid/operators/dirichlet_op.cc @@ -42,11 +42,11 @@ struct GammaCPUFunctor { }; template -struct DirichletSampler { +struct DirichletSampler { void operator()(const framework::ExecutionContext& ctx, const Tensor* alpha, Tensor* out) { - auto& dev_ctx = ctx.device_context(); + auto& dev_ctx = ctx.device_context(); auto p_gen = framework::DefaultCPUGenerator(); auto generator = p_gen->GetCPUEngine(); @@ -71,8 +71,7 @@ struct DirichletSampler { gamma_samples.data(), standard_uniform, standard_normal); - platform::ForRange for_range(dev_ctx, - alpha->numel()); + platform::ForRange for_range(dev_ctx, alpha->numel()); for_range(gamma_functor); // normalize them into a simplex, along the last axis @@ -81,10 +80,10 @@ struct DirichletSampler { new_shape[new_shape.size() - 1] = 1; gamma_sum.mutable_data(new_shape, dev_ctx.GetPlace()); - ReduceKernelFunctor( + ReduceKernelFunctor( &gamma_samples, &gamma_sum, {new_shape.size() - 1}, true, false, ctx) .template apply(); - ElementwiseComputeEx, platform::CPUDeviceContext, T, T>( + ElementwiseComputeEx, phi::CPUContext, T, T>( ctx, &gamma_samples, &gamma_sum, -1, DivFunctor(), out); } }; @@ -125,7 +124,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dirichlet, paddle::operators::DirichletOpMaker); REGISTER_OP_CPU_KERNEL( dirichlet, - paddle::operators::DirichletKernel, - paddle::operators::DirichletKernel); + paddle::operators::DirichletKernel, + paddle::operators::DirichletKernel); diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index bdf08646f1d8b..7733d202e5781 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -91,7 +91,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(Dropout, CPUDense) { f::Scope scope; p::CPUPlace place; - p::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); Compare(scope, ctx); } diff --git a/paddle/fluid/operators/eig_op.cc b/paddle/fluid/operators/eig_op.cc index 2a7c738f97913..b53bba9fac0c4 100644 --- a/paddle/fluid/operators/eig_op.cc +++ b/paddle/fluid/operators/eig_op.cc @@ -164,19 +164,15 @@ REGISTER_OPERATOR(eig, REGISTER_OPERATOR(eig_grad, ops::EigGradOp); 
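The dirichlet hunk above switches platform::ForRange to the new context type; ForRange applies a callable to every index in [0, n), as a serial loop on the CPU context and as a kernel launch on GPU. A standalone CPU-only sketch with a hypothetical CpuContext:

#include <cmath>
#include <cstdio>
#include <vector>

struct CpuContext {};

// CPU-only sketch of a ForRange-style helper: on the CPU context it is a
// plain serial loop; a GPU specialization would launch device code instead.
template <typename DeviceContext>
struct ForRange;

template <>
struct ForRange<CpuContext> {
  ForRange(const CpuContext&, size_t limit) : limit_(limit) {}

  template <typename Functor>
  void operator()(Functor func) const {
    for (size_t i = 0; i < limit_; ++i) func(i);
  }

  size_t limit_;
};

int main() {
  CpuContext ctx;
  std::vector<double> x{1.0, 4.0, 9.0};
  std::vector<double> y(x.size());

  ForRange<CpuContext> for_range(ctx, x.size());
  for_range([&](size_t i) { y[i] = std::sqrt(x[i]); });  // element-wise functor

  std::printf("%g %g %g\n", y[0], y[1], y[2]);  // 1 2 3
}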
-REGISTER_OP_CPU_KERNEL( - eig, - ops::EigKernel, - ops::EigKernel, - ops::EigKernel, - ops::EigKernel); +REGISTER_OP_CPU_KERNEL(eig, + ops::EigKernel, + ops::EigKernel, + ops::EigKernel, + ops::EigKernel); REGISTER_OP_CPU_KERNEL( eig_grad, - ops::EigGradKernel, - ops::EigGradKernel, - ops:: - EigGradKernel, - ops::EigGradKernel); + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index b677acbe96663..138a987a0bd9f 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -70,7 +70,7 @@ void TransposeTwoAxis(const Tensor& input, permute[axis2] = axis1; transposed_input->mutable_data(input.dims(), context.GetPlace()); - auto& dev_ctx = context.template device_context(); + auto& dev_ctx = context.template device_context(); TransCompute( input.dims().size(), dev_ctx, input, transposed_input, permute); diff --git a/paddle/fluid/operators/eigvals_op.cc b/paddle/fluid/operators/eigvals_op.cc index a01316787f247..cb81a1a64d1d5 100644 --- a/paddle/fluid/operators/eigvals_op.cc +++ b/paddle/fluid/operators/eigvals_op.cc @@ -86,10 +86,9 @@ REGISTER_OPERATOR(eigvals, ops::EigvalsOp, ops::EigvalsOpMaker, ops::EigvalsOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(eigvals, - ops::EigvalsKernel, - ops::EigvalsKernel, - ops::EigvalsKernel>, - ops::EigvalsKernel>); +REGISTER_OP_CPU_KERNEL( + eigvals, + ops::EigvalsKernel, + ops::EigvalsKernel, + ops::EigvalsKernel>, + ops::EigvalsKernel>); diff --git a/paddle/fluid/operators/eigvalsh_op.cc b/paddle/fluid/operators/eigvalsh_op.cc index 3684b926a1ac5..f7abdbee84f1d 100644 --- a/paddle/fluid/operators/eigvalsh_op.cc +++ b/paddle/fluid/operators/eigvalsh_op.cc @@ -151,24 +151,23 @@ REGISTER_OPERATOR(eigvalsh, ops::EigvalshGradOpMaker); REGISTER_OPERATOR(eigvalsh_grad, ops::EigvalshGradOp); -REGISTER_OP_CPU_KERNEL( - eigvalsh, - ops::EigvalshKernel, - ops::EigvalshKernel, - ops::EigvalshKernel>, - ops::EigvalshKernel>); +REGISTER_OP_CPU_KERNEL(eigvalsh, + ops::EigvalshKernel, + ops::EigvalshKernel, + ops::EigvalshKernel>, + ops::EigvalshKernel>); REGISTER_OP_CPU_KERNEL( eigvalsh_grad, - ops::EigvalshGradKernel, - ops::EigvalshGradKernel, - ops::EigvalshGradKernel, + ops::EigvalshGradKernel, + ops::EigvalshGradKernel>, - ops::EigvalshGradKernel>); diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index af6510ae3b931..6f1e04ebfa6cf 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -146,19 +146,17 @@ REGISTER_OPERATOR(expand_as, REGISTER_OPERATOR(expand_as_grad, ops::ExpandAsGradOp, ops::ExpandAsGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand_as, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CPU_KERNEL( - expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); +REGISTER_OP_CPU_KERNEL(expand_as, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel); +REGISTER_OP_CPU_KERNEL(expand_as_grad, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand_as, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 100158ce9c21e..d8c66f95a1395 
100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -280,19 +280,17 @@ REGISTER_OPERATOR(expand_grad, ops::ExpandDoubleGradOpMaker, ops::ExpandDoubleGradOpMaker, ops::ExpandGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel); -REGISTER_OP_CPU_KERNEL( - expand_grad, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel); +REGISTER_OP_CPU_KERNEL(expand, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL(expand_grad, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand, diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc index 03fbdfcd5ae77..5a75063fba7c1 100644 --- a/paddle/fluid/operators/exponential_op.cc +++ b/paddle/fluid/operators/exponential_op.cc @@ -62,8 +62,7 @@ class ExponentialOpInferVarType }; template -class ExponentialKernel - : public framework::OpKernel { +class ExponentialKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto *out = ctx.Output("Out"); @@ -135,9 +134,8 @@ REGISTER_OPERATOR(exponential_grad, ExponentialGradInferer); REGISTER_OP_CPU_KERNEL(exponential, - ops::ExponentialKernel, - ops::ExponentialKernel); -REGISTER_OP_CPU_KERNEL( - exponential_grad, - ops::ExponentialGradKernel, - ops::ExponentialGradKernel); + ops::ExponentialKernel, + ops::ExponentialKernel); +REGISTER_OP_CPU_KERNEL(exponential_grad, + ops::ExponentialGradKernel, + ops::ExponentialGradKernel); diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc index 58022ee6400fd..4e1df4f98ab57 100644 --- a/paddle/fluid/operators/fake_dequantize_op.cc +++ b/paddle/fluid/operators/fake_dequantize_op.cc @@ -23,8 +23,8 @@ namespace paddle { namespace operators { template -struct DequantizeFunctor { - void operator()(const platform::CPUDeviceContext& dev_ctx, +struct DequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor* scale, T max_range, @@ -39,8 +39,8 @@ struct DequantizeFunctor { }; template -struct ChannelDequantizeFunctor { - void operator()(const platform::CPUDeviceContext& dev_ctx, +struct ChannelDequantizeFunctor { + void operator()(const phi::CPUContext& dev_ctx, const framework::Tensor* in, const framework::Tensor** scales, const int scale_num, @@ -139,10 +139,10 @@ struct ChannelDequantizeFunctor { } }; -template struct DequantizeFunctor; -template struct DequantizeFunctor; -template struct ChannelDequantizeFunctor; -template struct ChannelDequantizeFunctor; +template struct DequantizeFunctor; +template struct DequantizeFunctor; +template struct ChannelDequantizeFunctor; +template struct ChannelDequantizeFunctor; class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel { public: @@ -269,7 +269,7 @@ Notes: In general, the per-channel quantization is only applied to weights and t } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( fake_dequantize_max_abs, diff --git a/paddle/fluid/operators/fake_quantize_op.cc 
b/paddle/fluid/operators/fake_quantize_op.cc index 61ee9d49ebeec..cb8263714a5e4 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -32,8 +32,8 @@ struct Compare { }; template -struct FindAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct FindAbsMaxFunctor { + void operator()(const phi::CPUContext &ctx, const T *in, const int num, T *out) { @@ -41,11 +41,11 @@ struct FindAbsMaxFunctor { } }; -template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; template -struct FindChannelAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct FindChannelAbsMaxFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in_tensor, const int quant_axis, T *out_abs_max) { @@ -86,11 +86,11 @@ struct FindChannelAbsMaxFunctor { } }; -template struct FindChannelAbsMaxFunctor; +template struct FindChannelAbsMaxFunctor; template -struct ClipAndFakeQuantFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct ClipAndFakeQuantFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -98,7 +98,7 @@ struct ClipAndFakeQuantFunctor { framework::Tensor *out) { T s = scale.data()[0]; T inv_s = inverse(s); - platform::Transform trans; + platform::Transform trans; if (round_type == 0) { trans(ctx, in.data(), @@ -117,11 +117,11 @@ struct ClipAndFakeQuantFunctor { } }; -template struct ClipAndFakeQuantFunctor; +template struct ClipAndFakeQuantFunctor; template -struct ClipAndFakeQuantDequantFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct ClipAndFakeQuantDequantFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -130,7 +130,7 @@ struct ClipAndFakeQuantDequantFunctor { T s = scale.data()[0]; T inv_s = inverse(s); - platform::Transform trans; + platform::Transform trans; if (round_type == 0) { trans(ctx, in.data(), @@ -151,12 +151,11 @@ struct ClipAndFakeQuantDequantFunctor { } } }; -template struct ClipAndFakeQuantDequantFunctor; +template struct ClipAndFakeQuantDequantFunctor; template -struct ChannelClipAndFakeQuantFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct ChannelClipAndFakeQuantFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -176,7 +175,7 @@ struct ChannelClipAndFakeQuantFunctor { auto *out_data = out->mutable_data(ctx.GetPlace()); auto in_dims = in.dims(); const int64_t channel = in_dims[quant_axis]; - platform::Transform trans; + platform::Transform trans; if (quant_axis == 0) { const int64_t channel_size = in.numel() / channel; for (int64_t i = 0; i < channel; i++) { @@ -235,11 +234,10 @@ struct ChannelClipAndFakeQuantFunctor { } }; -template struct ChannelClipAndFakeQuantFunctor; +template struct ChannelClipAndFakeQuantFunctor; template -struct ChannelClipFakeQuantDequantFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct ChannelClipFakeQuantDequantFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in, const framework::Tensor &scale, const int bin_cnt, @@ -258,7 +256,7 @@ struct ChannelClipFakeQuantDequantFunctor { auto *out_data = out->mutable_data(ctx.GetPlace()); auto in_dims = in.dims(); const int64_t channel = in_dims[quant_axis]; - platform::Transform trans; + platform::Transform trans; 
if (quant_axis == 0) { const int64_t channel_size = in.numel() / channel; for (int i = 0; i < channel; i++) { @@ -326,11 +324,10 @@ struct ChannelClipFakeQuantDequantFunctor { } }; -template struct ChannelClipFakeQuantDequantFunctor; +template struct ChannelClipFakeQuantDequantFunctor; template -struct FindRangeAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct FindRangeAbsMaxFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &cur_scale, const framework::Tensor &last_scale, const framework::Tensor &iter, @@ -349,18 +346,17 @@ struct FindRangeAbsMaxFunctor { max = cur; } else if (fabs(removed - max) < 1e-6) { int size = (it > window_size) ? window_size : it; - FindAbsMaxFunctor()( - ctx, scale_arr, size, &max); + FindAbsMaxFunctor()(ctx, scale_arr, size, &max); } out_scale->mutable_data(ctx.GetPlace())[0] = max; } }; -template struct FindRangeAbsMaxFunctor; +template struct FindRangeAbsMaxFunctor; template -struct FindMovingAverageAbsMaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct FindMovingAverageAbsMaxFunctor { + void operator()(const phi::CPUContext &ctx, const framework::Tensor &in_accum, const framework::Tensor &in_state, const T *cur_scale, @@ -382,8 +378,7 @@ struct FindMovingAverageAbsMaxFunctor { } }; -template struct FindMovingAverageAbsMaxFunctor; +template struct FindMovingAverageAbsMaxFunctor; class FakeQuantOrWithDequantAbsMaxOp : public framework::OperatorWithKernel { public: @@ -968,7 +963,7 @@ class StrightThroughEstimatorMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( fake_quantize_abs_max, diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index a851a6db5657f..43bb6089a87dd 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -223,7 +223,6 @@ REGISTER_OPERATOR( ops::FCOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fc, - ops::FCOpKernel, - ops::FCOpKernel); +REGISTER_OP_CPU_KERNEL(fc, + ops::FCOpKernel, + ops::FCOpKernel); diff --git a/paddle/fluid/operators/fill_any_op.cc b/paddle/fluid/operators/fill_any_op.cc index ddbfe226b647e..853ebbdd9e57c 100644 --- a/paddle/fluid/operators/fill_any_op.cc +++ b/paddle/fluid/operators/fill_any_op.cc @@ -95,20 +95,18 @@ REGISTER_OPERATOR(fill_any_grad, REGISTER_OP_CPU_KERNEL( fill_any, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel, - ops::FillAnyKernel); + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel, + ops::FillAnyKernel); REGISTER_OP_CPU_KERNEL( fill_any_grad, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel, - ops::FillAnyGradKernel); + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel, + ops::FillAnyGradKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc index 425222bcd660c..32a19750f420a 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_mlu.cc @@ -68,10 +68,10 @@ class FillConstantBatchSizeLikeOpMLUKernel : 
public framework::OpKernel { bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); if (cpu_place) { auto &dev_ctx = *pool.Get(platform::CPUPlace()); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), framework::TransToPhiDataType(data_type)); - functor(reinterpret_cast(dev_ctx), + functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } else { diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index ad4efbb3e0c63..02f89cfdd2691 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -70,10 +70,10 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); if (cpu_place) { auto &dev_ctx = *pool.Get(platform::CPUPlace()); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), framework::TransToPhiDataType(data_type)); - functor(reinterpret_cast(dev_ctx), + functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } else { diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index dc5079ddb605f..8e51c203d4122 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -124,9 +124,9 @@ class FillConstantKernel : public framework::OpKernel { : ""); tensor->mutable_data(platform::CPUPlace(), framework::TransToPhiDataType(data_type)); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CPUPlace()); - functor(reinterpret_cast(dev_ctx), + functor(reinterpret_cast(dev_ctx), tensor, static_cast(value)); } else if (actual_place == 1) { diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc index b9e91baa1e707..8bd0e328c1f5b 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -94,24 +94,22 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL( fill_zeros_like, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); REGISTER_OP_CPU_KERNEL( fill_zeros_like2, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel>, - ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel>, + ops::FillZerosLikeKernel>); diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index c584dd114e0eb..e160fc6f09ad0 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -438,35 +438,31 @@ REGISTER_OPERATOR(flatten_contiguous_range_grad, ops::FlattenContiguousRangeGradOp, ops::FlattenGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - flatten, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel, - ops::FlattenKernel); 
-REGISTER_OP_CPU_KERNEL( - flatten_grad, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel, - ops::FlattenGradKernel); -REGISTER_OP_CPU_KERNEL( - flatten2, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel, - ops::Flatten2Kernel); -REGISTER_OP_CPU_KERNEL( - flatten2_grad, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel, - ops::Flatten2GradKernel); +REGISTER_OP_CPU_KERNEL(flatten, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_CPU_KERNEL(flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_CPU_KERNEL(flatten2, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_CPU_KERNEL(flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel); diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 21e92e6d37511..5ec5a93ada46d 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -341,11 +341,9 @@ REGISTER_OPERATOR(fold_grad, ops::FoldGradOp, ops::FoldGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - fold, - ops::FoldOpKernel, - ops::FoldOpKernel); -REGISTER_OP_CPU_KERNEL( - fold_grad, - ops::FoldGradOpKernel, - ops::FoldGradOpKernel); +REGISTER_OP_CPU_KERNEL(fold, + ops::FoldOpKernel, + ops::FoldOpKernel); +REGISTER_OP_CPU_KERNEL(fold_grad, + ops::FoldGradOpKernel, + ops::FoldGradOpKernel); diff --git a/paddle/fluid/operators/frame_op.cc b/paddle/fluid/operators/frame_op.cc index 25efd98d37afd..45a6bc9994db7 100644 --- a/paddle/fluid/operators/frame_op.cc +++ b/paddle/fluid/operators/frame_op.cc @@ -185,22 +185,18 @@ REGISTER_OPERATOR(frame_grad, ops::FrameOpGrad); REGISTER_OP_CPU_KERNEL( frame, - ops::FrameKernel, - ops::FrameKernel, - ops::FrameKernel, - ops::FrameKernel, - ops::FrameKernel>, - ops::FrameKernel>); + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel, + ops::FrameKernel>, + ops::FrameKernel>); REGISTER_OP_CPU_KERNEL( frame_grad, - ops::FrameGradKernel, - ops::FrameGradKernel, - ops::FrameGradKernel, - ops::FrameGradKernel, - ops::FrameGradKernel>, - ops::FrameGradKernel>); + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel, + ops::FrameGradKernel>, + ops::FrameGradKernel>); diff --git a/paddle/fluid/operators/fsp_op.cc b/paddle/fluid/operators/fsp_op.cc index e1f82fb27ad0b..ff3a5a638daf0 100644 --- a/paddle/fluid/operators/fsp_op.cc +++ b/paddle/fluid/operators/fsp_op.cc @@ -169,11 +169,9 @@ REGISTER_OPERATOR(fsp, ops::FSPGradOpMaker, ops::FSPGradOpMaker); REGISTER_OPERATOR(fsp_grad, ops::FSPOpGrad); -REGISTER_OP_CPU_KERNEL( - fsp, - ops::FSPOpKernel, - ops::FSPOpKernel); -REGISTER_OP_CPU_KERNEL( - fsp_grad, - ops::FSPGradOpKernel, - ops::FSPGradOpKernel); +REGISTER_OP_CPU_KERNEL(fsp, + ops::FSPOpKernel, + ops::FSPOpKernel); +REGISTER_OP_CPU_KERNEL(fsp_grad, + ops::FSPGradOpKernel, + ops::FSPGradOpKernel); diff --git 
a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index d9405aa021dc1..31bb78922a5a5 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -463,17 +463,13 @@ REGISTER_OPERATOR(fused_elemwise_activation_grad, REGISTER_OP_CPU_KERNEL( fused_elemwise_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel); + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); REGISTER_OP_CPU_KERNEL( fused_elemwise_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel); + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); // for memory optimization, we register the fused_elemwise_add_activation OP REGISTER_OPERATOR( @@ -488,14 +484,10 @@ REGISTER_OPERATOR(fused_elemwise_add_activation_grad, REGISTER_OP_CPU_KERNEL( fused_elemwise_add_activation, - ops::FusedElemwiseActivationKernel, - ops::FusedElemwiseActivationKernel); + ops::FusedElemwiseActivationKernel, + ops::FusedElemwiseActivationKernel); REGISTER_OP_CPU_KERNEL( fused_elemwise_add_activation_grad, - ops::FusedElemwiseActivationGradKernel, - ops::FusedElemwiseActivationGradKernel); + ops::FusedElemwiseActivationGradKernel, + ops::FusedElemwiseActivationGradKernel); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index aae3be9aca568..8f413f34242a8 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -391,7 +391,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { GET_Ht(ct, gates, ht) void SeqCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; INIT_BASE_INPUT_OUTPUT INIT_BASE_SIZES INIT_VEC_FUNC @@ -496,7 +496,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { } void BatchCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; INIT_BASE_INPUT_OUTPUT if (ids->lod()[0].size() == 2) { SeqCompute(ctx); diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 937203c92fbf4..c593c65618d78 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -197,7 +197,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { const int m = batch_size * idx_width; const int n = table_width; const int k = table_height; - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); blas.CSRMM(&transa, &m, &n, @@ -313,7 +313,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { padding_idx); auto *d_output_data = d_output->data(); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); int width = static_cast(table_dim[1]); int num_seq = batch_size * idx_width; LOG(INFO) << "num seq = " << num_seq << " width = " << width; diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 9e31d6cfcfb6e..9556ed12880ae 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ 
b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -310,7 +310,7 @@ class FusionGRUKernel : public framework::OpKernel { T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; INIT_BASE_DEFINES; INIT_OTHER_DEFINES; const int N = x_lod[0].size() - 1; @@ -400,7 +400,7 @@ class FusionGRUKernel : public framework::OpKernel { } void BatchCompute(const framework::ExecutionContext& ctx) const { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; INIT_BASE_DEFINES; if (x_lod[0].size() == 2) { xx->Resize({total_T, D3}); diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 282b0a22a8cbe..5454c90b3c596 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -306,23 +306,23 @@ This operator fuse the X into LSTM, more details can refer to LSTM op. template class FuisonLSTMKernel : public framework::OpKernel { public: -#define INIT_BASE_DEFINES \ - using DeviceContext = paddle::platform::CPUDeviceContext; \ - auto* x = ctx.Input("X"); \ - auto* h0 = ctx.Input("H0"); \ - auto* c0 = ctx.Input("C0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* bias = ctx.Input("Bias"); \ - auto* xx = ctx.Output("XX"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - auto* cell_out = ctx.Output("Cell"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - bool use_peepholes = ctx.Attr("use_peepholes"); \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 4D*/ \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ +#define INIT_BASE_DEFINES \ + using DeviceContext = phi::CPUContext; \ + auto* x = ctx.Input("X"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ const int D4 = wh_dims[1] #define INIT_OTHER_DEFINES \ diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index e88deeae21431..2ebac6d7f7124 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -149,7 +149,7 @@ template class FusionSeqConvEltAddReluKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; auto* x = ctx.Input("X"); auto* w = ctx.Input("Filter"); auto* b = ctx.Input("Bias"); diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index f022d4156f4fa..6655c6756a5c8 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -149,7 +149,7 @@ template class FusionSeqExpandConcatFCOpKernel : public 
framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; auto ins = ctx.MultiInput("X"); auto* w = ctx.Input("FCWeight"); auto* b = ctx.Input("FCBias"); @@ -239,7 +239,7 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { auto blas = phi::funcs::GetBlas(ctx); - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx.template device_context(); phi::funcs::FCFunctor fc; fc(dev_ctx, total_T, diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 7b72f84191e04..ff983684708aa 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -22,9 +22,9 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; -using paddle::platform::CPUDeviceContext; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; +using phi::CPUContext; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index 1258e6bfaf21c..748de5dae9520 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -22,9 +22,9 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; -using paddle::platform::CPUDeviceContext; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; +using phi::CPUContext; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h b/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h index b999dddf8cfb0..a357a59a09420 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h +++ b/paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#pragma once + #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { @@ -19,10 +21,10 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; -using paddle::platform::CPUDeviceContext; using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; +using phi::CPUContext; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 44b39b7a80ab7..c59e7d661607c 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -28,10 +28,10 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; -using paddle::platform::CPUDeviceContext; using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; +using phi::CPUContext; using phi::vectorize; using platform::to_void_cast; using Direction = dnnl::rnn_direction; diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cc b/paddle/fluid/operators/fused_softmax_mask_op.cc index 013e214b426e5..11c1fa4af8560 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_op.cc @@ -117,7 +117,6 @@ REGISTER_OPERATOR(fused_softmax_mask, ops::SoftmaxMaskFuseGradOpMaker, ops::SoftmaxMaskFuseGradOpMaker); REGISTER_OPERATOR(fused_softmax_mask_grad, ops::SoftmaxMaskFuseOpGrad); -REGISTER_OP_CPU_KERNEL( - fused_softmax_mask, - ops::SoftmaxMaskFuseCPUKernel, - ops::SoftmaxMaskFuseCPUKernel); +REGISTER_OP_CPU_KERNEL(fused_softmax_mask, + ops::SoftmaxMaskFuseCPUKernel, + ops::SoftmaxMaskFuseCPUKernel); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc index f1748ad931247..5992fa2dfc6e4 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc @@ -104,10 +104,7 @@ REGISTER_OPERATOR( ops::SoftmaxMaskFuseUpperTriangleGradOpMaker); REGISTER_OPERATOR(fused_softmax_mask_upper_triangle_grad, ops::SoftmaxMaskFuseUpperTriangleOpGrad); -REGISTER_OP_CPU_KERNEL(fused_softmax_mask_upper_triangle, - ops::SoftmaxMaskFuseUpperTriangleCPUKernel< - paddle::platform::CPUDeviceContext, - float>, - ops::SoftmaxMaskFuseUpperTriangleCPUKernel< - paddle::platform::CPUDeviceContext, - double>); +REGISTER_OP_CPU_KERNEL( + fused_softmax_mask_upper_triangle, + ops::SoftmaxMaskFuseUpperTriangleCPUKernel, + ops::SoftmaxMaskFuseUpperTriangleCPUKernel); diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 676143bf01145..11c46d1772957 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -39,7 +39,7 @@ TEST(Gather, GatherData) { paddle::platform::CPUPlace()); auto* cpu_place = new paddle::platform::CPUPlace(); - paddle::platform::CPUDeviceContext ctx(*cpu_place); + phi::CPUContext ctx(*cpu_place); phi::funcs::CPUGather(ctx, *src, *index, output); delete cpu_place; cpu_place = NULL; diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cc b/paddle/fluid/operators/graph_khop_sampler_op.cc index 7f45b49518594..4702d66c3ccb3 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cc +++ b/paddle/fluid/operators/graph_khop_sampler_op.cc @@ -132,7 +132,7 @@ Graph Learning Sampling Neighbors operator, 
for graphsage sampling method. } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR(graph_khop_sampler, ops::GraphKhopSamplerOP, diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index d3c06ea496f1a..f5cfd7a162c8d 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -318,7 +318,7 @@ template class GRUCPUKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { - using DeviceContext = paddle::platform::CPUDeviceContext; + using DeviceContext = phi::CPUContext; using LodTensorPtr = LoDTensor*; bool is_test = context.Attr("is_test"); @@ -588,7 +588,6 @@ REGISTER_OPERATOR(gru_grad, REGISTER_OP_CPU_KERNEL(gru, ops::GRUCPUKernel, ops::GRUCPUKernel); -REGISTER_OP_CPU_KERNEL( - gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CPU_KERNEL(gru_grad, + ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index 404d434a88058..24d4771fac539 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -325,11 +325,9 @@ REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp, ops::GRUUnitGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_CPU_KERNEL( - gru_unit_grad, - ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CPU_KERNEL(gru_unit, + ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL(gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index 8de20d53ba8fa..835312851b2e4 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -150,12 +150,10 @@ REGISTER_OPERATOR(hinge_loss, ops::HingeLossGradOpMaker, ops::HingeLossGradOpMaker); REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp); -REGISTER_OP_CPU_KERNEL( - hinge_loss, - ops::HingeLossKernel); -REGISTER_OP_CPU_KERNEL( - hinge_loss_grad, - ops::HingeLossGradKernel); +REGISTER_OP_CPU_KERNEL(hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL(hinge_loss_grad, + ops::HingeLossGradKernel); REGISTER_OP_CUDA_KERNEL( hinge_loss, diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index ae8c91e2444ab..dce0ca7a646fd 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -195,12 +195,10 @@ REGISTER_OPERATOR(im2sequence, ops::Im2SequenceGradMaker, ops::Im2SequenceGradMaker); REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp); -REGISTER_OP_CPU_KERNEL( - im2sequence, - ops::Im2SequenceKernel); -REGISTER_OP_CPU_KERNEL( - im2sequence_grad, - ops::Im2SequenceGradKernel); +REGISTER_OP_CPU_KERNEL(im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CPU_KERNEL(im2sequence_grad, + ops::Im2SequenceGradKernel); REGISTER_OP_CUDA_KERNEL( im2sequence, diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index dfff354c4bd5e..c1880d2a1a194 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -382,11 +382,9 @@ REGISTER_OPERATOR(inplace_abn, InplaceAbnOpInplaceInferer) REGISTER_OPERATOR(inplace_abn_grad, ops::InplaceABNGradOp) -REGISTER_OP_CPU_KERNEL( - inplace_abn, 
- ops::InplaceABNKernel, - ops::InplaceABNKernel); -REGISTER_OP_CPU_KERNEL( - inplace_abn_grad, - ops::InplaceABNGradKernel, - ops::InplaceABNGradKernel); +REGISTER_OP_CPU_KERNEL(inplace_abn, + ops::InplaceABNKernel, + ops::InplaceABNKernel); +REGISTER_OP_CPU_KERNEL(inplace_abn_grad, + ops::InplaceABNGradKernel, + ops::InplaceABNGradKernel); diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index fd8d88ac940de..ff474cfff9727 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -1201,8 +1201,8 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1279,8 +1279,8 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1393,8 +1393,8 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, dim_grad = {n, in_d, in_h, in_w, c}; } input_grad->mutable_data(dim_grad, ctx.GetPlace()); - auto& device_ctx = ctx.template device_context(); - phi::funcs::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + phi::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc index 3c84b7c983eff..e93ca5ad54035 100644 --- a/paddle/fluid/operators/inverse_op.cc +++ b/paddle/fluid/operators/inverse_op.cc @@ -137,11 +137,9 @@ REGISTER_OPERATOR(inverse, REGISTER_OPERATOR(inverse_grad, ops::InverseGradOp); -REGISTER_OP_CPU_KERNEL( - inverse, - ops::InverseKernel, - ops::InverseKernel); -REGISTER_OP_CPU_KERNEL( - inverse_grad, - ops::InverseGradKernel, - ops::InverseGradKernel); +REGISTER_OP_CPU_KERNEL(inverse, + ops::InverseKernel, + ops::InverseKernel); +REGISTER_OP_CPU_KERNEL(inverse_grad, + ops::InverseGradKernel, + ops::InverseGradKernel); diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index bcab28df3a155..77583fd2d30f1 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -122,64 +122,35 @@ namespace ops = paddle::operators; paddle::framework::EmptyGradOpMaker, \ paddle::framework::EmptyGradOpMaker) -#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor) \ - REGISTER_OP_CPU_KERNEL( \ - op_type, \ - ops::OverflowKernel, \ - ops::OverflowKernel, \ - ops::OverflowKernel, \ - ops::OverflowKernel); +#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor) \ + REGISTER_OP_CPU_KERNEL( \ + op_type, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel, \ + ops::OverflowKernel); REGISTER_OP_MAKER(isinf, "isinf(X)"); REGISTER_OP_MAKER(isnan, "isnan(X)"); REGISTER_OP_MAKER(isfinite, "isfinite(X)"); -REGISTER_OP_CPU_KERNEL(isinf, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL(isnan, - ops::OverflowKernel, - ops::OverflowKernel, - 
ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL(isfinite, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); +REGISTER_OP_CPU_KERNEL( + isinf, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); + +REGISTER_OP_CPU_KERNEL( + isnan, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); + +REGISTER_OP_CPU_KERNEL( + isfinite, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel, + ops::OverflowKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 56e1c19721378..c7bf0d538bd97 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -93,11 +93,9 @@ REGISTER_OPERATOR(l1_norm, ops::L1NormGradMaker, ops::L1NormGradMaker); REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp); -REGISTER_OP_CPU_KERNEL( - l1_norm, ops::L1NormKernel); -REGISTER_OP_CPU_KERNEL( - l1_norm_grad, - ops::L1NormGradKernel); +REGISTER_OP_CPU_KERNEL(l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL(l1_norm_grad, + ops::L1NormGradKernel); REGISTER_OP_CUDA_KERNEL( l1_norm, ops::L1NormKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index 397b26e119416..99c10e868a396 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -395,12 +395,10 @@ REGISTER_OPERATOR(linear_chain_crf, REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp, ops::LinearChainCRFGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); +REGISTER_OP_CPU_KERNEL(linear_chain_crf, + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); REGISTER_OP_CPU_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index 8d345b237bfc5..de6daf33f8426 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -129,8 +129,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { emission_row_max.mutable_data( phi::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); - auto& place = *ctx.template device_context() - .eigen_device(); + auto& place = + *ctx.template device_context().eigen_device(); auto x = framework::EigenMatrix::From(emission_weights_tmp); auto x_row_max = framework::EigenMatrix::From(emission_row_max); x_row_max.device(place) = @@ -325,21 +325,20 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor one_seq_beta = beta.Slice(start_pos, end_pos); Tensor one_seq_emission_grad = emission_grad_tmp.Slice(start_pos, end_pos); - BackwardOneSequence( - ctx.template device_context(), - ll_grad[i], - one_seq_emission_exps, - *transition_exps, - one_seq_alpha, - one_seq_label, - &one_seq_beta, - transition_grad, - &one_seq_emission_grad); + BackwardOneSequence(ctx.template device_context(), + ll_grad[i], + one_seq_emission_exps, + *transition_exps, + one_seq_alpha, + one_seq_label, + &one_seq_beta, + transition_grad, + &one_seq_emission_grad); } }; private: - void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + void BackwardOneSequence(const phi::CPUContext& ctx, const T ll_grad, const 
Tensor& emission_exps, const Tensor& transition_exps, diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index c38386365f3dc..fed71abe16637 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -79,7 +79,7 @@ TEST(LiteEngineOp, engine_op) { ctx.PartialInitWithAllocator(); #else platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); #endif // Prepare variables. CreateTensor(&scope, "x", std::vector({2, 4}), true); diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h index c1a67edbfd455..574b7cbec28ce 100644 --- a/paddle/fluid/operators/lite/ut_helper.h +++ b/paddle/fluid/operators/lite/ut_helper.h @@ -60,7 +60,7 @@ void serialize_params(std::string* str, platform::CUDAPlace place; platform::CUDADeviceContext ctx(place); #else - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; #endif for (const auto& param : params) { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 65d8a03245f8f..94bfc44977fb3 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -86,10 +86,9 @@ REGISTER_OPERATOR(load_combine, REGISTER_OP_CPU_KERNEL( load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index c1a9782b97a4c..4eebda7d53a3c 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -68,10 +68,9 @@ REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker); REGISTER_OP_CPU_KERNEL( load, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel, - ops::LoadOpKernel); + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 4c1a2deeaf480..147b23f56acdc 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -62,7 +62,7 @@ struct LoDTensorToArrayFunctor : public boost::static_visitor { auto &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = pool.Get(place); if (std::is_same::value) { - Apply(static_cast(dev_ctx)); + Apply(static_cast(dev_ctx)); } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) Apply(static_cast(dev_ctx)); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 3ca39e621b2a4..31a3e40f12e82 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -116,8 +116,7 @@ class LookupTableKernel : public framework::OpKernel { table + id_index * row_width, row_width * sizeof(T)); } else { - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); blas.VCOPY(row_width, table + id_index * row_width, output + i * row_width); @@ -148,8 +147,7 @@ class LookupTableKernel : public framework::OpKernel { table + 
id_index * row_width, row_width * sizeof(T)); } else { - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); blas.VCOPY(row_width, table + id_index * row_width, output + i * row_width); diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 9f9dbe9b336bd..1e12b00ebb944 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -132,8 +132,7 @@ struct LookupTableV2CPUFunctor { table + id_index * row_width, row_width * sizeof(T)); } else { - auto blas = - phi::funcs::GetBlas(context_); + auto blas = phi::funcs::GetBlas(context_); blas.VCOPY(row_width, table + id_index * row_width, output + i * row_width); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 8ec7f3a142c30..73fe170f6d5e8 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -31,7 +31,7 @@ using framework::Tensor; using DataLayout = framework::DataLayout; template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, @@ -46,9 +46,9 @@ struct LRNFunctor { T beta, const DataLayout data_layout) { auto place = ctx.GetPlace(); - auto blas = phi::funcs::GetBlas(ctx); - phi::funcs::Transpose transpose; - auto& dev_ctx = ctx.template device_context(); + auto blas = phi::funcs::GetBlas(ctx); + phi::funcs::Transpose transpose; + auto& dev_ctx = ctx.template device_context(); Tensor in_transpose, mid_transpose, out_transpose; // if channel_last, transpose to channel_first if (data_layout == DataLayout::kNHWC) { @@ -116,11 +116,11 @@ struct LRNFunctor { } } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, @@ -183,8 +183,8 @@ struct LRNGradFunctor { } } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; class LRNOp : public framework::OperatorWithKernel { public: @@ -435,7 +435,5 @@ REGISTER_OPERATOR(lrn, ops::LRNGradOpMaker); REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad); -REGISTER_OP_CPU_KERNEL( - lrn, ops::LRNKernel); -REGISTER_OP_CPU_KERNEL( - lrn_grad, ops::LRNGradKernel); +REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL(lrn_grad, ops::LRNGradKernel); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 668200cf4ff5d..ba56eeddf89d1 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -356,11 +356,9 @@ REGISTER_OPERATOR(lstm, ops::LSTMGradOpMaker, ops::LSTMGradOpMaker); REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); -REGISTER_OP_CPU_KERNEL( - lstm, - ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CPU_KERNEL( - lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CPU_KERNEL(lstm, + ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL(lstm_grad, + ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index bc064fb61caa4..84e4e5cd2cdf8 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -400,11 +400,9 @@ REGISTER_OPERATOR(lstmp, ops::LSTMPGradMaker, ops::LSTMPGradMaker); 
REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp); -REGISTER_OP_CPU_KERNEL( - lstmp, - ops::LSTMPKernel, - ops::LSTMPKernel); -REGISTER_OP_CPU_KERNEL( - lstmp_grad, - ops::LSTMPGradKernel, - ops::LSTMPGradKernel); +REGISTER_OP_CPU_KERNEL(lstmp, + ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CPU_KERNEL(lstmp_grad, + ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/fluid/operators/lstsq_op.cc b/paddle/fluid/operators/lstsq_op.cc index 792ede9959f77..70ce5082ced30 100644 --- a/paddle/fluid/operators/lstsq_op.cc +++ b/paddle/fluid/operators/lstsq_op.cc @@ -150,7 +150,6 @@ This API processes Lstsq functor for general matrices. namespace ops = paddle::operators; REGISTER_OPERATOR(lstsq, ops::LstsqOp, ops::LstsqOpMaker) -REGISTER_OP_CPU_KERNEL( - lstsq, - ops::LstsqCPUKernel, - ops::LstsqCPUKernel); +REGISTER_OP_CPU_KERNEL(lstsq, + ops::LstsqCPUKernel, + ops::LstsqCPUKernel); diff --git a/paddle/fluid/operators/lu_op.cc b/paddle/fluid/operators/lu_op.cc index 1021b157ba374..1f569950dad52 100644 --- a/paddle/fluid/operators/lu_op.cc +++ b/paddle/fluid/operators/lu_op.cc @@ -114,9 +114,7 @@ class LUKernel : public framework::OpKernel { "lu without pivoting is not implemented on the CPU, " "but got pivots=False")); - math::DeviceIndependenceTensorOperations - helper(ctx); + math::DeviceIndependenceTensorOperations helper(ctx); *out = helper.Transpose(*xin); auto outdims = out->dims(); @@ -235,5 +233,5 @@ REGISTER_OPERATOR(lu_grad, REGISTER_OP_CPU_KERNEL(lu, ops::LUKernel, ops::LUKernel); REGISTER_OP_CPU_KERNEL(lu_grad, - ops::LUGradKernel, - ops::LUGradKernel); + ops::LUGradKernel, + ops::LUGradKernel); diff --git a/paddle/fluid/operators/lu_unpack_op.cc b/paddle/fluid/operators/lu_unpack_op.cc index b696f3fbd04bb..4c6b37ed3e55e 100644 --- a/paddle/fluid/operators/lu_unpack_op.cc +++ b/paddle/fluid/operators/lu_unpack_op.cc @@ -186,9 +186,8 @@ REGISTER_OPERATOR(lu_unpack_grad, ops::LU_UnpackGradOpVarTypeInference); REGISTER_OP_CPU_KERNEL(lu_unpack, - ops::LU_UnpackKernel, - ops::LU_UnpackKernel); -REGISTER_OP_CPU_KERNEL( - lu_unpack_grad, - ops::LU_UnpackGradKernel, - ops::LU_UnpackGradKernel); + ops::LU_UnpackKernel, + ops::LU_UnpackKernel); +REGISTER_OP_CPU_KERNEL(lu_unpack_grad, + ops::LU_UnpackGradKernel, + ops::LU_UnpackGradKernel); diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index 4b11497058e43..44f77afee0005 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -181,9 +181,7 @@ REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossGradMaker, ops::MarginRankLossGradMaker); REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp); -REGISTER_OP_CPU_KERNEL( - margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_CPU_KERNEL( - margin_rank_loss_grad, - ops::MarginRankLossGradKernel); +REGISTER_OP_CPU_KERNEL(margin_rank_loss, + ops::MarginRankLossKernel); +REGISTER_OP_CPU_KERNEL(margin_rank_loss_grad, + ops::MarginRankLossGradKernel); diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc index d6a39faea519c..992d9e9f276c4 100644 --- a/paddle/fluid/operators/match_matrix_tensor_op.cc +++ b/paddle/fluid/operators/match_matrix_tensor_op.cc @@ -273,7 +273,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { memset( bottom_l_trans_data, 0.0, tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); 
call_gemm(blas, CblasNoTrans, @@ -295,7 +295,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { const auto* l_t_data = bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; const auto* r_data = bottom_r_data + offset_r[b] * dim_in; - auto blas_2 = phi::funcs::GetBlas(ctx); + auto blas_2 = phi::funcs::GetBlas(ctx); call_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, @@ -388,7 +388,7 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { } } - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); auto* t_data = w->data(); auto* d_w = ctx.Output(framework::GradVarName("W")); @@ -456,10 +456,8 @@ REGISTER_OPERATOR(match_matrix_tensor_grad, ops::MatchMatrixTensorOpGrad); REGISTER_OP_CPU_KERNEL( match_matrix_tensor, - ops::CPUMatchMatrixTensorOPKernel); + ops::CPUMatchMatrixTensorOPKernel); REGISTER_OP_CPU_KERNEL( match_matrix_tensor_grad, - ops::CPUMatchMatrixTensorOPGradKernel); + ops::CPUMatchMatrixTensorOPGradKernel); diff --git a/paddle/fluid/operators/math/beam_search_test.cc b/paddle/fluid/operators/math/beam_search_test.cc index b51b9ee0d675a..f6b0349f1ca28 100644 --- a/paddle/fluid/operators/math/beam_search_test.cc +++ b/paddle/fluid/operators/math/beam_search_test.cc @@ -230,8 +230,7 @@ void TestBeamSearch(); + TestBeamSearch(); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 3df69e200190f..603584629cc92 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -38,9 +38,9 @@ namespace math { * each dimension must be the same, except the axis dimension. */ template -class ConcatFunctor { +class ConcatFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const std::vector& input, int axis, framework::Tensor* output) { @@ -54,9 +54,9 @@ class ConcatFunctor { * each dimension must be the same, except the axis dimension. 
*/ template -class SplitFunctor { +class SplitFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const std::vector& ref_inputs, const int axis, @@ -335,9 +335,9 @@ class SplitFunctor { }; #endif -#define DEFINE_FUNCTOR(type) \ - template class ConcatFunctor; \ - template class SplitFunctor; +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; FOR_ALL_TYPES(DEFINE_FUNCTOR); diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 3ae314e55d87d..4f0fee91e5919 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -493,8 +493,7 @@ void TestConcatMain(); + TestConcatMain(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TestConcatMain(); diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc index 85f012afb505a..0daf46d36fd21 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cc +++ b/paddle/fluid/operators/math/cos_sim_functor.cc @@ -18,8 +18,8 @@ namespace paddle { namespace operators { namespace math { template -struct CosSimDyFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct CosSimDyFunctor { + void operator()(const phi::CPUContext& ctx, const T* x_norm, const T* y_norm, const T* x, @@ -46,8 +46,8 @@ struct CosSimDyFunctor { } }; -template struct CosSimDyFunctor; -template struct CosSimDyFunctor; +template struct CosSimDyFunctor; +template struct CosSimDyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index dcbd66c12b91e..a056341c3bf3c 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -66,7 +66,7 @@ struct MatrixEighFunctor { // symmetric matrices, and uses the variable has_vectors to // control whether to return the eigenvectors. 
template -struct MatrixEighFunctor { +struct MatrixEighFunctor { public: void operator()(const framework::ExecutionContext &ctx, const Tensor &input, @@ -78,8 +78,7 @@ struct MatrixEighFunctor { auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = - math::DeviceIndependenceTensorOperations( - ctx); + math::DeviceIndependenceTensorOperations(ctx); Tensor input_trans; // lapack is a column-major storge, transpose make the input to diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index 7e543a63afc9e..857d870847ee8 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -20,8 +20,8 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::CPUDeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::CPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -29,7 +29,7 @@ struct GRUUnitFunctor { const detail::ActivationType active_gate, bool origin_mode) { #if !defined(__NVCC__) && !defined(__HIPCC___) - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -83,8 +83,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::CPUDeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::CPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -100,7 +100,7 @@ struct GRUUnitGradFunctor { batch_size, active_node, origin_mode); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, true, @@ -175,15 +175,15 @@ struct GRUUnitGradFunctor { }; template -struct GRUUnitFunctorV2 { - static void compute(const platform::CPUDeviceContext &context, +struct GRUUnitFunctorV2 { + static void compute(const phi::CPUContext &context, GRUMetaValue value, int frame_size, int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { #if !defined(__NVCC__) && !defined(__HIPCC___) - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(CblasNoTrans, CblasTrans, @@ -226,8 +226,8 @@ struct GRUUnitFunctorV2 { }; template -struct GRUUnitGradFunctorV2 { - static void compute(const platform::CPUDeviceContext &context, +struct GRUUnitGradFunctorV2 { + static void compute(const phi::CPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -245,7 +245,7 @@ struct GRUUnitGradFunctorV2 { batch_size, active_node, active_gate); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (grad.prev_out_grad && value.prev_out_value) { // update prev_out_grad blas.GEMM(false, @@ -349,15 +349,15 @@ struct GRUUnitGradFunctorV2 { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; -template struct GRUUnitFunctorV2; -template struct GRUUnitFunctorV2; -template struct GRUUnitGradFunctorV2; -template struct GRUUnitGradFunctorV2; +template struct GRUUnitFunctorV2; +template struct GRUUnitFunctorV2; +template struct GRUUnitGradFunctorV2; +template struct 
GRUUnitGradFunctorV2; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index fc045ba8be458..93ee9d3a15bad 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -341,7 +341,7 @@ void testIm2col(); + testIm2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) testIm2col(); @@ -350,7 +350,7 @@ TEST(math, im2col) { #define PREPARE_IM2COL_CPU \ paddle::platform::CPUPlace place; \ - paddle::platform::CPUDeviceContext context(place); \ + phi::CPUContext context(place); \ paddle::framework::Tensor input; \ paddle::framework::Tensor out; \ paddle::framework::Tensor ref; \ @@ -367,7 +367,7 @@ TEST(math, im2col) { ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ paddle::operators::math::Im2ColFunctor< \ paddle::operators::math::ColFormat::kCFO, \ - paddle::platform::CPUDeviceContext, \ + phi::CPUContext, \ float> \ im2col diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 133680ca9a8c7..8a6f098baefd9 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -137,8 +137,7 @@ struct MatrixBitCodeFunctorMul : public boost::static_visitor { template void operator()(const CodeTable &code_table) { - auto blas = phi::funcs::GetBlas( - platform::CPUDeviceContext()); + auto blas = phi::funcs::GetBlas(phi::CPUContext()); size_t num_samples = tmat_->dims()[0]; size_t tmat_width = tmat_->dims()[1]; size_t input_width = input_.dims()[1]; @@ -185,8 +184,7 @@ struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor { : tmat_(tmat), weight_(weight), input_(input) {} template void operator()(const CodeTable &code_table) { - auto blas = phi::funcs::GetBlas( - platform::CPUDeviceContext()); + auto blas = phi::funcs::GetBlas(phi::CPUContext()); size_t num_samples = tmat_.dims()[0]; size_t input_width = input_.dims()[1]; size_t tmat_width = tmat_.dims()[1]; @@ -239,8 +237,7 @@ struct MatrixBitCodeFunctorMulGradWeightSR template void operator()(const CodeTable &code_table) { - auto blas = phi::funcs::GetBlas( - platform::CPUDeviceContext()); + auto blas = phi::funcs::GetBlas(phi::CPUContext()); size_t num_samples = tmat_.dims()[0]; size_t input_width = input_.dims()[1]; size_t tmat_width = tmat_.dims()[1]; diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc index 4d38dc7137935..b0f8843a530c0 100644 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -23,18 +23,18 @@ namespace operators { namespace math { template -class MatrixSolveFunctor { +class MatrixSolveFunctor { public: - void operator()(const platform::CPUDeviceContext& dev_ctx, + void operator()(const phi::CPUContext& dev_ctx, const framework::Tensor& a, const framework::Tensor& b, framework::Tensor* out) { - compute_solve_eigen(dev_ctx, a, b, out); + compute_solve_eigen(dev_ctx, a, b, out); } }; -template class MatrixSolveFunctor; -template class MatrixSolveFunctor; +template class MatrixSolveFunctor; +template class MatrixSolveFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 399a1b6dc4ccd..9ec1172c410d8 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ 
b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -25,8 +25,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::CPUDeviceContext& context, +struct SelectedRowsAdd { + void operator()(const phi::CPUContext& context, const phi::SelectedRows& input1, const phi::SelectedRows& input2, phi::SelectedRows* output) { @@ -106,12 +106,12 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; template -struct SelectedRowsAddTensor { - void operator()(const platform::CPUDeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const phi::CPUContext& context, const phi::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { @@ -156,7 +156,7 @@ struct SelectedRowsAddTensor { in1_row_numel, output->numel() / in1_height)); - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; functor(context, output, 0.0); auto* in1_data = in1_value.data(); @@ -175,12 +175,12 @@ struct SelectedRowsAddTensor { } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::CPUDeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const phi::CPUContext& context, const phi::SelectedRows& input1, const int64_t input2_offset, phi::SelectedRows* input2) { @@ -225,14 +225,14 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; template -struct SelectedRowsSumTo { - void operator()(const platform::CPUDeviceContext& context, +struct SelectedRowsSumTo { + void operator()(const phi::CPUContext& context, const std::vector& input1, const std::vector& input2_offsets, phi::SelectedRows* input2) { @@ -262,7 +262,7 @@ struct SelectedRowsSumTo { auto* in2_value = input2->mutable_value(); auto* in2_data = in2_value->data(); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); size_t offset = 0u; for (size_t i = 0u; i != input1.size(); ++i) { auto& in_value = input1[i]->value(); @@ -273,8 +273,8 @@ struct SelectedRowsSumTo { } }; -template struct SelectedRowsSumTo; -template struct SelectedRowsSumTo; +template struct SelectedRowsSumTo; +template struct SelectedRowsSumTo; template struct SelectedRowsAddToTensor { @@ -734,15 +734,15 @@ struct MergeAdd { #endif template -struct MergeAverage { - phi::SelectedRows operator()(const platform::CPUDeviceContext& context, +struct MergeAverage { + phi::SelectedRows operator()(const phi::CPUContext& context, const phi::SelectedRows& input) { phi::SelectedRows out; (*this)(context, input, &out); return out; } - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const phi::SelectedRows& input, phi::SelectedRows* output) { std::vector inputs; @@ -750,7 +750,7 @@ struct MergeAverage { (*this)(context, inputs, output); } - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const std::vector& inputs, phi::SelectedRows* output) { if (inputs.size() == 0) { 
@@ -803,7 +803,7 @@ struct MergeAverage { out.set_rows(merge_rows); - phi::funcs::SetConstant constant_functor; + phi::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), 0.0); std::unordered_map rows_to_id; @@ -811,7 +811,7 @@ struct MergeAverage { rows_to_id[merge_rows[i]] = i; } - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); for (auto* input : inputs) { if (input->rows().size() == 0) { continue; @@ -841,14 +841,14 @@ struct MergeAverage { template struct MergeAdd; #endif -template struct MergeAverage; -template struct MergeAverage; -template struct MergeAverage; -template struct MergeAverage; +template struct MergeAverage; +template struct MergeAverage; +template struct MergeAverage; +template struct MergeAverage; template -struct UpdateToTensor { - void operator()(const platform::CPUDeviceContext& context, +struct UpdateToTensor { + void operator()(const phi::CPUContext& context, const ScatterOps& op, const phi::SelectedRows& input1, framework::Tensor* input2) { diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index d0383ee5fc21d..ecb8aa7824724 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -19,8 +19,8 @@ limitations under the License. */ TEST(selected_rows_functor, cpu_add) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -48,9 +48,7 @@ TEST(selected_rows_functor, cpu_add) { // simplely concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), cpu_place); - paddle::operators::math::SelectedRowsAdd - add_functor; + paddle::operators::math::SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -90,9 +88,8 @@ TEST(selected_rows_functor, cpu_add) { new paddle::framework::Tensor()}; tensor2->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); - paddle::operators::math:: - SelectedRowsAddTensor - add_tensor_functor; + paddle::operators::math::SelectedRowsAddTensor + add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); auto* tensor2_data = tensor2->data(); @@ -114,8 +111,8 @@ TEST(selected_rows_functor, cpu_add) { TEST(selected_rows_functor, cpu_add_to) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -144,8 +141,7 @@ TEST(selected_rows_functor, cpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), cpu_place); - paddle::operators::math::SelectedRowsAddTo + paddle::operators::math::SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -183,9 +179,8 @@ TEST(selected_rows_functor, cpu_add_to) { tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - paddle::operators::math:: - SelectedRowsAddToTensor - add_to_tensor_functor; + paddle::operators::math::SelectedRowsAddToTensor + add_to_tensor_functor; add_to_tensor_functor(ctx, *output, 
tensor1.get()); auto* tensor1_data = tensor1->data(); @@ -207,8 +202,8 @@ TEST(selected_rows_functor, cpu_add_to) { TEST(selected_rows_functor, cpu_merge_average_float) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -221,9 +216,8 @@ TEST(selected_rows_functor, cpu_merge_average_float) { cpu_place); functor(ctx, in_value, 1.0); - paddle::operators::math::scatter:: - MergeAverage - merge_average_functor; + paddle::operators::math::scatter::MergeAverage + merge_average_functor; phi::SelectedRows output = merge_average_functor(ctx, *selected_rows); auto out_height = output.height(); @@ -243,8 +237,8 @@ TEST(selected_rows_functor, cpu_merge_average_float) { TEST(selected_rows_functor, cpu_merge_add_float) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -259,8 +253,7 @@ TEST(selected_rows_functor, cpu_merge_add_float) { std::unique_ptr output{new phi::SelectedRows()}; - paddle::operators::math::scatter::MergeAdd + paddle::operators::math::scatter::MergeAdd merge_add_functor; merge_add_functor(ctx, *selected_rows, output.get()); @@ -281,8 +274,8 @@ TEST(selected_rows_functor, cpu_merge_add_float) { TEST(selected_rows_functor, cpu_merge_add_int) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -297,8 +290,7 @@ TEST(selected_rows_functor, cpu_merge_add_int) { std::unique_ptr output{new phi::SelectedRows()}; - paddle::operators::math::scatter::MergeAdd + paddle::operators::math::scatter::MergeAdd merge_add_functor; merge_add_functor(ctx, *selected_rows, output.get()); @@ -319,8 +311,8 @@ TEST(selected_rows_functor, cpu_merge_add_int) { TEST(selected_rows_functor, cpu_merge_add_multi) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant set_const; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -345,8 +337,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { std::unique_ptr output{new phi::SelectedRows()}; output->set_height(height); - paddle::operators::math::scatter::MergeAdd + paddle::operators::math::scatter::MergeAdd merge_add_functor; std::vector inputs; @@ -370,8 +361,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { paddle::platform::CPUPlace cpu_place; - paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant set_const; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -396,8 +387,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { std::unique_ptr output{new phi::SelectedRows()}; output->set_height(height); - paddle::operators::math::scatter::MergeAdd + paddle::operators::math::scatter::MergeAdd merge_add_functor; std::vector inputs; @@ -427,8 +417,8 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { TEST(selected_rows_functor, cpu_sum_to) { paddle::platform::CPUPlace cpu_place; - 
paddle::platform::CPUDeviceContext ctx(cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext ctx(cpu_place); + phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; std::vector rows1{0, 4, 7}; @@ -454,8 +444,7 @@ TEST(selected_rows_functor, cpu_sum_to) { auto* out_value = output->mutable_value(); // simplely concat two SelectedRows out_value->mutable_data(phi::make_ddim({7, 10}), cpu_place); - paddle::operators::math::SelectedRowsSumTo + paddle::operators::math::SelectedRowsSumTo sum_to_functor; sum_to_functor(ctx, std::vector( @@ -490,9 +479,8 @@ TEST(selected_rows_functor, cpu_sum_to) { new paddle::framework::Tensor()}; tensor1->mutable_data(phi::make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - paddle::operators::math:: - SelectedRowsAddToTensor - add_to_tensor_functor; + paddle::operators::math::SelectedRowsAddToTensor + add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); auto* tensor1_data = tensor1->data(); // row0: 1.0 + 2.0 + 3.0 diff --git a/paddle/fluid/operators/math/sequence_padding_test.cc b/paddle/fluid/operators/math/sequence_padding_test.cc index 3b30f9358a3a0..06eca480ec622 100644 --- a/paddle/fluid/operators/math/sequence_padding_test.cc +++ b/paddle/fluid/operators/math/sequence_padding_test.cc @@ -101,18 +101,16 @@ void TestSequencePadding(const DeviceContext &context, TEST(Seq2BatchPadding, CPU) { auto place = paddle::platform::CPUPlace(); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePadding( - *context, lod1, 16); + TestSequencePadding(*context, lod1, 16); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePadding( - *context, lod2, 128); + TestSequencePadding(*context, lod2, 128); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 69675f5e9219a..a600c37a89108 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -38,7 +38,7 @@ using EigenMatrix = framework::EigenMatrix; template class MaxSeqPoolFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& input, T pad_value, framework::LoDTensor* output, @@ -117,7 +117,7 @@ class MaxSeqPoolFunctor { template class MaxSeqPoolFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& input, T pad_value, framework::LoDTensor* output, @@ -178,7 +178,7 @@ class MaxSeqPoolFunctor { template class MaxSeqPoolGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -224,7 +224,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -241,7 +241,7 @@ class MaxSeqPoolGradFunctor { template class LastSeqPoolFunctor { public: - void operator()(const 
platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& input, T pad_value, framework::LoDTensor* output) { @@ -275,7 +275,7 @@ class LastSeqPoolFunctor { template class FirstSeqPoolFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& input, T pad_value, framework::LoDTensor* output) { @@ -309,7 +309,7 @@ class FirstSeqPoolFunctor { template class SumSeqPoolGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad) { auto lod_level = in_grad->lod().size(); @@ -328,7 +328,7 @@ class SumSeqPoolGradFunctor { out_w)); const T* out_g_data = out_grad.data(); T* in_g_data = in_grad->mutable_data(context.GetPlace()); - auto blas = phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); if (h == 0) continue; @@ -343,10 +343,10 @@ class SumSeqPoolGradFunctor { }; template -class SequencePoolFunctor { +class SequencePoolFunctor { public: /* max pool has index output */ - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const std::string pooltype, T pad_value, const framework::LoDTensor& input, @@ -435,9 +435,9 @@ class SequencePoolFunctor { }; template -class SequencePoolGradFunctor { +class SequencePoolGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const std::string pooltype, const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, @@ -451,7 +451,7 @@ class SequencePoolGradFunctor { if (pooltype == "LAST" || pooltype == "FIRST") { // set X@Grad be zero at first when pooltype is LAST/FIRST - phi::funcs::SetConstant functor; + phi::funcs::SetConstant functor; functor(context, in_grad, 0); } @@ -495,10 +495,10 @@ class SequencePoolGradFunctor { } }; -template class SequencePoolFunctor; -template class SequencePoolFunctor; -template class SequencePoolGradFunctor; -template class SequencePoolGradFunctor; +template class SequencePoolFunctor; +template class SequencePoolFunctor; +template class SequencePoolGradFunctor; +template class SequencePoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index ec82a2439a6c3..63d922b7ebb80 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -117,18 +117,16 @@ void TestSequencePoolingSum(const DeviceContext &context, TEST(SequencePoolingGrad, CPU_SUM) { auto place = paddle::platform::CPUPlace(); - auto *context = static_cast( + auto *context = static_cast( paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::framework::LoD lod1; lod1.push_back(std::vector{0, 10}); - TestSequencePoolingSum( - *context, lod1, 128); + TestSequencePoolingSum(*context, lod1, 128); paddle::framework::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); - TestSequencePoolingSum( - *context, lod2, 128); + TestSequencePoolingSum(*context, lod2, 128); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/math/softmax_impl.h 
b/paddle/fluid/operators/math/softmax_impl.h index 93ae9cad7674e..18cd3e7261dd7 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -226,7 +226,7 @@ void SoftmaxFunctor::operator()( template using enable_if_CPU = typename std::enable_if< - std::is_same::value>::type; + std::is_same::value>::type; template class SoftmaxFunctor> { diff --git a/paddle/fluid/operators/math/squared_l2_norm.h b/paddle/fluid/operators/math/squared_l2_norm.h index ba584953a0d1e..3054d5f8f0029 100644 --- a/paddle/fluid/operators/math/squared_l2_norm.h +++ b/paddle/fluid/operators/math/squared_l2_norm.h @@ -34,7 +34,7 @@ namespace operators { namespace math { template -void SquaredL2Norm(const platform::CPUDeviceContext& ctx, +void SquaredL2Norm(const phi::CPUContext& ctx, const T1* x, T2* y, size_t numel, diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index fae1122fa0596..70f377e42e59f 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -84,9 +84,9 @@ void Tree2ColUtil::construct_tree(const framework::Tensor &EdgeSet, } template -class Tree2ColFunctor { +class Tree2ColFunctor { public: - void operator()(const platform::CPUDeviceContext &context, + void operator()(const phi::CPUContext &context, const framework::Tensor &EdgeSet, const framework::Tensor &node_features, framework::Tensor *patch, @@ -94,7 +94,7 @@ class Tree2ColFunctor { std::vector> tr; const auto &feature_dims = node_features.dims(); auto cpu_place = context.GetPlace(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; int64_t feature_size = feature_dims[1]; size_t patch_elem_size = 3 * static_cast(feature_size); size_t node_count = 0, patch_count = 0, patch_size; @@ -138,9 +138,9 @@ class Tree2ColFunctor { } }; template -class Col2TreeFunctor { +class Col2TreeFunctor { public: - void operator()(const platform::CPUDeviceContext &context, + void operator()(const phi::CPUContext &context, const framework::Tensor &EdgeSet, const framework::Tensor &out_grad, framework::Tensor *in_grad, @@ -148,7 +148,7 @@ class Col2TreeFunctor { std::vector> tr; const auto &output_dims = out_grad.dims(); auto cpu_place = context.GetPlace(); - phi::funcs::SetConstant constant; + phi::funcs::SetConstant constant; int64_t output_size = output_dims[1]; size_t grad_elem_size = 3 * static_cast(output_size); size_t node_count = 0, grad_count = 0; @@ -195,10 +195,10 @@ class Col2TreeFunctor { } }; -template class Tree2ColFunctor; -template class Tree2ColFunctor; -template class Col2TreeFunctor; -template class Col2TreeFunctor; +template class Tree2ColFunctor; +template class Tree2ColFunctor; +template class Col2TreeFunctor; +template class Col2TreeFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc index c834a03f9731b..d119e814585b5 100644 --- a/paddle/fluid/operators/math/unpooling.cc +++ b/paddle/fluid/operators/math/unpooling.cc @@ -18,9 +18,9 @@ namespace paddle { namespace operators { namespace math { template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -61,9 +61,9 @@ class Unpool2dMaxFunctor { } }; template -class Unpool2dMaxGradFunctor { +class 
Unpool2dMaxGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -107,9 +107,9 @@ class Unpool2dMaxGradFunctor { }; template -class Unpool3dMaxFunctor { +class Unpool3dMaxFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { @@ -153,9 +153,9 @@ class Unpool3dMaxFunctor { } }; template -class Unpool3dMaxGradFunctor { +class Unpool3dMaxGradFunctor { public: - void operator()(const platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -201,14 +201,14 @@ class Unpool3dMaxGradFunctor { } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxGradFunctor; -template class Unpool3dMaxFunctor; -template class Unpool3dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxGradFunctor; +template class Unpool3dMaxFunctor; +template class Unpool3dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index f2c5fa88fda60..ec3926b95ee87 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -254,7 +254,7 @@ void testVol2col(); + testVol2col(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) testVol2col(); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 13df41852dd5a..c79073861ab6e 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -1041,19 +1041,16 @@ REGISTER_OPERATOR(matmul_grad, ops::MatMulOpDoubleGradMaker, ops::MatMulOpDoubleGradMaker); REGISTER_OPERATOR(matmul_grad_grad, ops::MatMulOpDoubleGrad); -REGISTER_OP_CPU_KERNEL( - matmul, - ops::MatMulKernel, - ops::MatMulKernel); -REGISTER_OP_CPU_KERNEL( - matmul_grad, - ops::MatMulGradKernel, - ops::MatMulGradKernel); - -REGISTER_OP_CPU_KERNEL( - matmul_grad_grad, - ops::MatMulDoubleGradKernel, - ops::MatMulDoubleGradKernel); +REGISTER_OP_CPU_KERNEL(matmul, + ops::MatMulKernel, + ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL(matmul_grad, + ops::MatMulGradKernel, + ops::MatMulGradKernel); + +REGISTER_OP_CPU_KERNEL(matmul_grad_grad, + ops::MatMulDoubleGradKernel, + ops::MatMulDoubleGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h index 88f8be63f26c8..b5ac6fd677bac 100644 --- a/paddle/fluid/operators/mean_iou_op.h +++ b/paddle/fluid/operators/mean_iou_op.h @@ -31,8 +31,8 @@ template class MeanIoUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context() - .eigen_device(); + auto& place = + *ctx.template device_context().eigen_device(); 
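    // Sketch (illustrative, not part of the patch): the accessor pattern in this
    // hunk. The Eigen device still comes from the execution context's templated
    // device_context<>() call; on the CPU path the template argument is now
    // phi::CPUContext rather than platform::CPUDeviceContext, e.g.
    //   auto& place =
    //       *ctx.template device_context<phi::CPUContext>().eigen_device();
    //   out_eigen.device(place) = in_eigen * static_cast<T>(2);
    // "out_eigen"/"in_eigen" stand in for EigenVector<T>::Flatten views of the
    // kernel's tensors; they are placeholders, not names from this file.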
// get input and output tensor auto* predictions = ctx.Input("Predictions"); auto* labels = ctx.Input("Labels"); diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc index fc1944b2ad6fb..ef89a730a0ff9 100644 --- a/paddle/fluid/operators/merge_selected_rows_op.cc +++ b/paddle/fluid/operators/merge_selected_rows_op.cc @@ -100,7 +100,6 @@ REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOpMaker, ops::MergeSelectedRowsOpInferVarType); -REGISTER_OP_CPU_KERNEL( - merge_selected_rows, - ops::MergeSelectedRowsKernel, - ops::MergeSelectedRowsKernel); +REGISTER_OP_CPU_KERNEL(merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 5edc39f8fc7b8..1e369c81538ed 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -153,8 +153,7 @@ REGISTER_OPERATOR(minus, ops::MinusOpMaker, ops::MinusGradDescMaker, ops::MinusGradMaker); -REGISTER_OP_CPU_KERNEL( - minus, ops::MinusKernel); +REGISTER_OP_CPU_KERNEL(minus, ops::MinusKernel); REGISTER_OP_CUDA_KERNEL( minus, ops::MinusKernel); diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 058be90cd82ac..ea56b84c90889 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -198,8 +198,7 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { x_dims = x->dims(); auto axes = ctx.Attr("axis"); out_dims = phi::make_ddim( - FlattenKernel::GetOutputShape( - axes, x_dims)); + FlattenKernel::GetOutputShape(axes, x_dims)); } protected: diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index bd6d55fb7b3fa..2202349bd66c1 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -41,8 +41,8 @@ class MKLDNNDeviceContext; namespace paddle { namespace operators { -using paddle::platform::CPUDeviceContext; using paddle::platform::MKLDNNDeviceContext; +using phi::CPUContext; using platform::to_void_cast; template diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc index f3fcab3ca5490..17f323d0bcba8 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cc +++ b/paddle/fluid/operators/modified_huber_loss_op.cc @@ -176,8 +176,7 @@ REGISTER_OPERATOR( ops::ModifiedHuberLossGradOpMaker); REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp); -REGISTER_OP_CPU_KERNEL( - modified_huber_loss, - ops::ModifiedHuberLossKernel); +REGISTER_OP_CPU_KERNEL(modified_huber_loss, + ops::ModifiedHuberLossKernel); REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad, ops::ModifiedHuberLossGradCPUKernel); diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index e43112f423692..76737f2bc35a7 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -100,7 +100,7 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(norm, NormInferShapeFunctor, diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index e749a267c970b..59842249adcdd 100644 --- a/paddle/fluid/operators/one_hot_op.cc 
+++ b/paddle/fluid/operators/one_hot_op.cc @@ -133,7 +133,6 @@ REGISTER_OPERATOR( ops::OneHotOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - one_hot, - ops::OneHotKernel, - ops::OneHotKernel); +REGISTER_OP_CPU_KERNEL(one_hot, + ops::OneHotKernel, + ops::OneHotKernel); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 8b9569db1a63c..90ce98c4dc316 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -131,6 +131,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, ops::DecayedAdagradOpMaker); -REGISTER_OP_CPU_KERNEL( - decayed_adagrad, - ops::DecayedAdagradOpKernel); +REGISTER_OP_CPU_KERNEL(decayed_adagrad, + ops::DecayedAdagradOpKernel); diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc index f283cbd21ef9e..09847ff216f5a 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc @@ -74,6 +74,5 @@ REGISTER_OP_WITHOUT_GRADIENT(dgc_momentum, ops::DGCMomentumOp, ops::DGCMomentumOpMaker); -REGISTER_OP_CPU_KERNEL( - dgc_momentum, - ops::DGCMomentumKernel); +REGISTER_OP_CPU_KERNEL(dgc_momentum, + ops::DGCMomentumKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index 95b45934ea6d2..e32cf36251742 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -120,4 +120,4 @@ REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb_init, REGISTER_OP_CPU_KERNEL( distributed_fused_lamb_init, - ops::DistributedFusedLambInitOpKernel); + ops::DistributedFusedLambInitOpKernel); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 224e2a4de3f74..b85eb16a39cf2 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -167,4 +167,4 @@ REGISTER_OP_WITHOUT_GRADIENT(distributed_fused_lamb, REGISTER_OP_CPU_KERNEL( distributed_fused_lamb, - ops::DistributedFusedLambOpKernel); + ops::DistributedFusedLambOpKernel); diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index 023c3f27cf29e..ad1262a7d2d55 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -132,7 +132,6 @@ CCS16 - Deep Learning with Differential Privacy. 
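// ---------------------------------------------------------------------------
// Sketch (illustrative, hypothetical "example_sgd" op, not added by this
// patch): the CPU kernel registration shape these hunks converge on. The
// kernel class keeps its DeviceContext template parameter; only the CPU
// instantiation changes its argument:
//   REGISTER_OP_CPU_KERNEL(example_sgd,
//                          ops::ExampleSGDOpKernel<phi::CPUContext, float>,
//                          ops::ExampleSGDOpKernel<phi::CPUContext, double>);
// versus the retired ops::ExampleSGDOpKernel<platform::CPUDeviceContext, T>.
// ---------------------------------------------------------------------------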
namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(dpsgd, ops::DpsgdOp, ops::DpsgdOpMaker); -REGISTER_OP_CPU_KERNEL( - dpsgd, - ops::DpsgdOpKernel, - ops::DpsgdOpKernel); +REGISTER_OP_CPU_KERNEL(dpsgd, + ops::DpsgdOpKernel, + ops::DpsgdOpKernel); diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc index edafacf508dcb..50060b1636943 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cc +++ b/paddle/fluid/operators/optimizers/ftrl_op.cc @@ -157,5 +157,4 @@ The paper that proposed Follow The Regularized Leader (FTRL): namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker); -REGISTER_OP_CPU_KERNEL( - ftrl, ops::FTRLOpKernel); +REGISTER_OP_CPU_KERNEL(ftrl, ops::FTRLOpKernel); diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc index e2df17fd720ad..8434da2bb0e76 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.cc +++ b/paddle/fluid/operators/optimizers/lamb_op.cc @@ -247,10 +247,9 @@ learning rate, $\lambda$ the weight decay rate. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(lamb, ops::LambOp, ops::LambOpMaker); -REGISTER_OP_CPU_KERNEL( - lamb, - ops::LambOpKernel, - ops::LambOpKernel); +REGISTER_OP_CPU_KERNEL(lamb, + ops::LambOpKernel, + ops::LambOpKernel); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(lamb).AddCheckpoint( diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cc b/paddle/fluid/operators/optimizers/merged_adam_op.cc index 042905ddfe489..69ca8ec3c6670 100644 --- a/paddle/fluid/operators/optimizers/merged_adam_op.cc +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cc @@ -135,7 +135,6 @@ REGISTER_OP_WITHOUT_GRADIENT(merged_adamw, ops::MergedAdamOp, ops::MergedAdamOpMaker); -REGISTER_OP_CPU_KERNEL( - merged_adam, - ops::MergedAdamOpKernel, - ops::MergedAdamOpKernel); +REGISTER_OP_CPU_KERNEL(merged_adam, + ops::MergedAdamOpKernel, + ops::MergedAdamOpKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc index b640e47e6e638..e6aec5cec9e66 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cc @@ -104,7 +104,6 @@ REGISTER_OP_WITHOUT_GRADIENT(merged_momentum, ops::MergedMomentumOp, ops::MergedMomentumOpMaker); -REGISTER_OP_CPU_KERNEL( - merged_momentum, - ops::MergedMomentumOpKernel, - ops::MergedMomentumOpKernel); +REGISTER_OP_CPU_KERNEL(merged_momentum, + ops::MergedMomentumOpKernel, + ops::MergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc index 7486c0c2b8cbe..e332972f7576a 100644 --- a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc +++ b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc @@ -23,7 +23,7 @@ namespace paddle { namespace operators { template -class SGDOneDNNKernel : public SGDOpKernel { +class SGDOneDNNKernel : public SGDOpKernel { protected: void dense_param_and_grad_kernel( const framework::ExecutionContext &ctx) const override { diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc index 5eeeb7353072e..f576827f9cadf 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc +++ 
b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -85,5 +85,5 @@ REGISTER_OP_WITHOUT_GRADIENT(pow2_decay_with_linear_warmup, ops::Pow2DecayWithLinearWarmupOpMaker); REGISTER_OP_CPU_KERNEL( pow2_decay_with_linear_warmup, - ops::Pow2DecayWithLinearWarmupOpKernel, - ops::Pow2DecayWithLinearWarmupOpKernel); + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index a5424f5cda5f0..072e39dd91cc0 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -134,6 +134,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, ops::ProximalAdagradOpMaker); -REGISTER_OP_CPU_KERNEL( - proximal_adagrad, - ops::ProximalAdagradOpKernel); +REGISTER_OP_CPU_KERNEL(proximal_adagrad, + ops::ProximalAdagradOpKernel); diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index dc7e9c90af59f..50676863678c1 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -107,6 +107,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, ops::ProximalGDOpMaker); -REGISTER_OP_CPU_KERNEL( - proximal_gd, - ops::ProximalGDOpKernel); +REGISTER_OP_CPU_KERNEL(proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index cb87850f43c5e..02d8bcbd279dc 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -159,8 +159,7 @@ class SGDOpKernel : public framework::OpKernel { }; template -class SGDOpKernel - : public framework::OpKernel { +class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *param_var = ctx.InputVar("Param"); diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc index 48f211f9c5ace..a92bbbc838a8a 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.cc @@ -119,7 +119,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, ops::SparseMomentumOpInferVarType); -REGISTER_OP_CPU_KERNEL( - sparse_momentum, - ops::SparseMomentumOpKernel, - ops::SparseMomentumOpKernel); +REGISTER_OP_CPU_KERNEL(sparse_momentum, + ops::SparseMomentumOpKernel, + ops::SparseMomentumOpKernel); diff --git a/paddle/fluid/operators/overlap_add_op.cc b/paddle/fluid/operators/overlap_add_op.cc index 18d88f1069185..108c2df4cd2e1 100644 --- a/paddle/fluid/operators/overlap_add_op.cc +++ b/paddle/fluid/operators/overlap_add_op.cc @@ -186,22 +186,20 @@ REGISTER_OPERATOR(overlap_add_grad, ops::OverlapAddOpGrad); REGISTER_OP_CPU_KERNEL( overlap_add, - ops::OverlapAddKernel, - ops::OverlapAddKernel, - ops::OverlapAddKernel, - ops::OverlapAddKernel, - ops::OverlapAddKernel>, - ops::OverlapAddKernel>); + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel, + ops::OverlapAddKernel>, + ops::OverlapAddKernel>); REGISTER_OP_CPU_KERNEL( overlap_add_grad, - ops::OverlapAddGradKernel, - ops::OverlapAddGradKernel, - 
ops::OverlapAddGradKernel, - ops::OverlapAddGradKernel, - ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel, + ops::OverlapAddGradKernel>, - ops::OverlapAddGradKernel>); diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc index c512870792073..766ecaee0d6c9 100644 --- a/paddle/fluid/operators/p_norm_op.cc +++ b/paddle/fluid/operators/p_norm_op.cc @@ -112,7 +112,7 @@ class PnormOpGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; DECLARE_INFER_SHAPE_FUNCTOR(p_norm, PNormInferShapeFunctor, diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 72073ed3067c3..c2dfb8e61e5eb 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -536,8 +536,8 @@ class Pad2dGradCPUKernel : public framework::OpKernel { auto d_out_dims = d_out->dims(); const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(context.template device_context(), + phi::funcs::SetConstant set_zero; + set_zero(context.template device_context(), d_in, static_cast(0)); const int pad_top = pads[0]; diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 571ead1710a92..e523c93f5d10b 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -247,19 +247,17 @@ REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOpGradMaker); REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad); -REGISTER_OP_CPU_KERNEL( - pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); +REGISTER_OP_CPU_KERNEL(pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); REGISTER_OP_CPU_KERNEL( pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); REGISTER_OP_CUDA_KERNEL( pad_constant_like, diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index 98bbede0323a9..e9b54632ddc01 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -211,12 +211,11 @@ REGISTER_OPERATOR(partial_concat, REGISTER_OPERATOR(partial_concat_grad, ops::PartialConcatGradOp); -REGISTER_OP_CPU_KERNEL( - partial_concat, - ops::PartialConcatKernel, - ops::PartialConcatKernel, - ops::PartialConcatKernel, - ops::PartialConcatKernel); +REGISTER_OP_CPU_KERNEL(partial_concat, + ops::PartialConcatKernel, + ops::PartialConcatKernel, + ops::PartialConcatKernel, + ops::PartialConcatKernel); REGISTER_OP_CPU_KERNEL(partial_concat_grad, ops::PartialConcatGradientOpKernel, diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index f99617fdc634f..affe06f20956a 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -111,8 +111,8 @@ class PartialConcatGradientOpKernel : public framework::OpKernel { 
auto all_length = grad_batch_len * batch_size; // initialize - auto& place = *ctx.template device_context() - .eigen_device(); + auto& place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 3b69efb8e7489..4d4c1e54cff27 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -210,12 +210,11 @@ REGISTER_OPERATOR(partial_sum, REGISTER_OPERATOR(partial_sum_grad, ops::PartialSumGradOp); -REGISTER_OP_CPU_KERNEL( - partial_sum, - ops::PartialSumKernel, - ops::PartialSumKernel, - ops::PartialSumKernel, - ops::PartialSumKernel); +REGISTER_OP_CPU_KERNEL(partial_sum, + ops::PartialSumKernel, + ops::PartialSumKernel, + ops::PartialSumKernel, + ops::PartialSumKernel); REGISTER_OP_CPU_KERNEL(partial_sum_grad, ops::PartialSumGradientOpKernel, diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index b45c4cb9b65c7..58ac0671dde10 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -79,8 +79,8 @@ class PartialSumGradientOpKernel : public framework::OpKernel { } // initialize - auto& place = *ctx.template device_context() - .eigen_device(); + auto& place = + *ctx.template device_context().eigen_device(); for (size_t i = 0; i < outs.size(); ++i) { outs[i]->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*outs[i]); diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index e40fe2025e281..4f9d1343c8395 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -122,9 +122,9 @@ class MLUPoolOpKernel : public framework::OpKernel { handle, pool_mode, out_w, out_h, &extra_input_size); if (extra_input_size > 0) { - paddle::platform::CPUDeviceContext cpu_ctx; + phi::CPUContext cpu_ctx; framework::Tensor extra_host_tensor = - ctx.AllocateTmpTensor( + ctx.AllocateTmpTensor( {static_cast(extra_input_size)}, cpu_ctx); cnnlInitPoolingExtraInput(handle, pool_desc.get(), diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc index c9e45fe51cf14..cf8f17d5f747c 100644 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -200,15 +200,13 @@ REGISTER_OPERATOR(prroi_pool, ops::PRROIPoolGradMaker, ops::PRROIPoolGradMaker); REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - prroi_pool, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - prroi_pool_grad, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel); +REGISTER_OP_CPU_KERNEL(prroi_pool, + ops::CPUPRROIPoolOpKernel, + ops::CPUPRROIPoolOpKernel, + ops::CPUPRROIPoolOpKernel, + ops::CPUPRROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL(prroi_pool_grad, + ops::CPUPRROIPoolGradOpKernel, + ops::CPUPRROIPoolGradOpKernel, + ops::CPUPRROIPoolGradOpKernel, + ops::CPUPRROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cc b/paddle/fluid/operators/prune_gate_by_capacity_op.cc index 91223ff0d4813..14494f426d2d0 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cc +++ 
b/paddle/fluid/operators/prune_gate_by_capacity_op.cc @@ -128,6 +128,5 @@ REGISTER_OP_WITHOUT_GRADIENT(prune_gate_by_capacity, REGISTER_OP_CPU_KERNEL( prune_gate_by_capacity, - ops::PruneGateByCapacityCPUKernel, - ops::PruneGateByCapacityCPUKernel); + ops::PruneGateByCapacityCPUKernel, + ops::PruneGateByCapacityCPUKernel); diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc index 6f0bb0a39d473..dbdf58637580d 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc @@ -152,5 +152,4 @@ REGISTER_OPERATOR(distributed_lookup_table, REGISTER_OP_CPU_KERNEL( distributed_lookup_table, - ops::DistributedLookupTableKernel); + ops::DistributedLookupTableKernel); diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc index 32326531dd779..a2bf63da10bd2 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc @@ -135,6 +135,5 @@ REGISTER_OPERATOR(distributed_push_sparse, REGISTER_OP_CPU_KERNEL( distributed_push_sparse, - ops::DistributedPushSparseKernel, - ops::DistributedPushSparseKernel); + ops::DistributedPushSparseKernel, + ops::DistributedPushSparseKernel); diff --git a/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc index 9f28bd27f10af..0d0897b0af011 100644 --- a/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc +++ b/paddle/fluid/operators/pscore/heter_cloud_comm_cpu_test.cc @@ -134,7 +134,7 @@ void PressTestSendRecv( int64_t data_size = vars_len; VLOG(0) << "float num: " << data_size; float* data_ptr = new float[data_size]; - file.read((char*)data_ptr, 9437184); + file.read(static_cast(data_ptr), 9437184); VLOG(0) << "send data is: " << data_ptr[0] << ", " << data_ptr[1]; std::vector var_names{"34"}; int loopCnt = 10000; @@ -169,7 +169,7 @@ void PressTestSendRecv( delete[] values; std::ofstream recv("/recv_20_34", std::ios::out | std::ios::binary); - recv.write((char*)values, data_size); + recv.write(static_cast(values, data_size)); recv.close(); t.join(); } @@ -177,7 +177,7 @@ void PressTestSendRecv( void TestScopeSendRecv( std::shared_ptr heter_client_ptr_) { platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); framework::Executor exe(place); std::shared_ptr send_scope_ptr = std::make_shared(); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index 1bf0cd598d438..b7267d0c6bc55 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -150,7 +150,7 @@ void RunHeterServerOp(std::string endpoint) { framework::Scope scope; platform::CPUPlace place; framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); LOG(INFO) << "before GetHeterListenAndServProgram"; GetHeterListenAndServProgram(&program, endpoint); @@ -211,7 +211,7 @@ TEST(HETER_LISTEN_AND_SERV, CPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); // create var on local scope int64_t rows_numel = 10; diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc 
b/paddle/fluid/operators/pscore/heter_server_test.cc index c9cd445c98a14..a0332e857cc4a 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -162,7 +162,7 @@ void StartSendAndRecvServer(std::string endpoint) { framework::Scope scope; platform::CPUPlace place; framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); LOG(INFO) << "before AppendSendAndRecvBlock"; auto block = AppendSendAndRecvBlock(&program); std::string in_var_name("x"); @@ -254,7 +254,7 @@ TEST(SENDANDRECV, CPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); // create var on local scope int64_t rows_numel = 10; diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 71f7cf6a91be5..73eb3f1509223 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -104,12 +104,11 @@ REGISTER_OP_CUDA_KERNEL( ops::SendAndRecvKernel, ops::SendAndRecvKernel, ops::SendAndRecvKernel); -REGISTER_OP_CPU_KERNEL( - send_and_recv, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel, - ops::SendAndRecvKernel); +REGISTER_OP_CPU_KERNEL(send_and_recv, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel, + ops::SendAndRecvKernel); REGISTER_OP_VERSION(send_and_recv) .AddCheckpoint( diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc index 11ef5cc99e842..61ef001930a04 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc @@ -138,7 +138,7 @@ void StartSendAndRecvServer(std::string endpoint) { framework::Scope scope; platform::CPUPlace place; framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); LOG(INFO) << "before AppendSendAndRecvBlock"; auto block = AppendSendAndRecvBlock(&program); std::string in_var_name("x"); @@ -227,7 +227,7 @@ TEST(SENDANDRECV, CPU) { framework::Scope* scope = (*micro_scope)[0]; platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); framework::Executor exe(place); // create var on local scope diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index e443439dafe83..8d0d2d3090c17 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -163,7 +163,7 @@ void StartSendAndRecvServer(std::string endpoint) { framework::Scope scope; platform::CPUPlace place; framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); LOG(INFO) << "before AppendSendAndRecvBlock"; auto block = AppendSendAndRecvBlock(&program); std::string in_var_name("x"); diff --git a/paddle/fluid/operators/pscore/switch_server_test.cc b/paddle/fluid/operators/pscore/switch_server_test.cc index 4af4d4b89275d..a5e6fff4804af 100644 --- a/paddle/fluid/operators/pscore/switch_server_test.cc +++ b/paddle/fluid/operators/pscore/switch_server_test.cc @@ -55,7 +55,7 @@ void StartSwitchInterServer( int main(int argc, char* argv[]) { platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); + phi::CPUContext ctx(place); 
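  // Sketch (illustrative, not part of the patch): test-side setup after the
  // migration, matching the hunks above. A CPU context is either constructed
  // in place or fetched from the pool with the phi type:
  //   paddle::platform::CPUPlace place;
  //   phi::CPUContext ctx(place);
  //   auto* pooled = static_cast<phi::CPUContext*>(
  //       paddle::platform::DeviceContextPool::Instance().Get(place));
  //   phi::funcs::SetConstant<phi::CPUContext, float> functor;
  //   functor(ctx, &some_tensor, 1.0f);
  // "some_tensor" is a placeholder for whatever LoDTensor the test prepares.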
framework::Executor exe(place); framework::ProgramDesc program; diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 512179ba56526..9255a5f164bc4 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -207,23 +207,19 @@ REGISTER_OPERATOR(py_layer, REGISTER_OP_CPU_KERNEL( py_layer, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel, - ops::PyLayerOpKernel>, - ops::PyLayerOpKernel>); + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel, + ops::PyLayerOpKernel>, + ops::PyLayerOpKernel>); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( py_layer, diff --git a/paddle/fluid/operators/pyramid_hash_op.cc b/paddle/fluid/operators/pyramid_hash_op.cc index fbda6a13d6592..0dd74f9324fa3 100644 --- a/paddle/fluid/operators/pyramid_hash_op.cc +++ b/paddle/fluid/operators/pyramid_hash_op.cc @@ -586,10 +586,8 @@ REGISTER_OPERATOR(pyramid_hash, ops::PyramidHashGradOpMaker); REGISTER_OPERATOR(pyramid_hash_grad, ops::PyramidHashOpGrad); -REGISTER_OP_CPU_KERNEL( - pyramid_hash, - ops::CPUPyramidHashOPKernel, - ops::CPUPyramidHashOPKernel); -REGISTER_OP_CPU_KERNEL( - pyramid_hash_grad, - ops::CPUPyramidHashOPGradKernel); +REGISTER_OP_CPU_KERNEL(pyramid_hash, + ops::CPUPyramidHashOPKernel, + ops::CPUPyramidHashOPKernel); +REGISTER_OP_CPU_KERNEL(pyramid_hash_grad, + ops::CPUPyramidHashOPGradKernel); diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc index 4074d0dfc63af..90ace1ba773d1 100644 --- a/paddle/fluid/operators/qr_op.cc +++ b/paddle/fluid/operators/qr_op.cc @@ -124,7 +124,6 @@ REGISTER_OPERATOR(qr, REGISTER_OPERATOR(qr_grad, ops::QrGradOp); -REGISTER_OP_CPU_KERNEL( - qr_grad, - ops::QrGradKernel, - ops::QrGradKernel); +REGISTER_OP_CPU_KERNEL(qr_grad, + ops::QrGradKernel, + ops::QrGradKernel); diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index 4580acbe3fc83..65be8acaa5525 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -25,8 +25,8 @@ namespace paddle { namespace operators { template -struct ChannelDequantizeFunctorV2 { - void operator()(const platform::CPUDeviceContext &dev_ctx, +struct ChannelDequantizeFunctorV2 { + void operator()(const phi::CPUContext &dev_ctx, const framework::Tensor *in, const framework::Tensor *scale, T max_range, @@ -72,8 +72,8 @@ struct ChannelDequantizeFunctorV2 { } }; -template struct ChannelDequantizeFunctorV2; -template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; +template struct ChannelDequantizeFunctorV2; class QuantizeLinearOp : public framework::OperatorWithKernel { public: @@ -176,7 +176,7 @@ In above three formulas, the range value of c is as follow: } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +using CPU = phi::CPUContext; REGISTER_OPERATOR( quantize_linear, diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc index d7c30142ee778..b86cd9538acea 100644 --- 
a/paddle/fluid/operators/random_crop_op.cc +++ b/paddle/fluid/operators/random_crop_op.cc @@ -98,7 +98,7 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); template -using Kernel = ops::RandomCropKernel; +using Kernel = ops::RandomCropKernel; REGISTER_OP_CPU_KERNEL(random_crop, Kernel, Kernel, diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index 78841dae77fb6..aee430b50579d 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -30,7 +30,7 @@ template struct Random; template <> -struct Random { +struct Random { using Engine = std::minstd_rand; template @@ -218,7 +218,7 @@ class RandomCropKernel : public framework::OpKernel { for_range(functor); - Random::Engine engine(seed); + Random::Engine engine(seed); engine.discard(functor.prod_batchsize_dims_ * (functor.rank_ - functor.num_batchsize_dims_)); *ctx.Output("SeedOut")->mutable_data( diff --git a/paddle/fluid/operators/rank_attention_op.cc b/paddle/fluid/operators/rank_attention_op.cc index 4c97b9bf5bd1c..716fc58d4187b 100644 --- a/paddle/fluid/operators/rank_attention_op.cc +++ b/paddle/fluid/operators/rank_attention_op.cc @@ -194,10 +194,9 @@ REGISTER_OPERATOR(rank_attention_grad, ops::RankAttentionGradOp, ops::RankAttentionGradOpNoNeedBufferVarsInference); -REGISTER_OP_CPU_KERNEL( - rank_attention, - ops::RankAttentionKernel, - ops::RankAttentionKernel); +REGISTER_OP_CPU_KERNEL(rank_attention, + ops::RankAttentionKernel, + ops::RankAttentionKernel); REGISTER_OP_VERSION(rank_attention) .AddCheckpoint( diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 49d6424394ab7..edf82d00950ae 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -240,11 +240,9 @@ REGISTER_OPERATOR(rank_loss, ops::RankLossGradMaker, ops::RankLossGradMaker); REGISTER_OPERATOR(rank_loss_grad, ops::RankLossGradOp); -REGISTER_OP_CPU_KERNEL( - rank_loss, ops::RankLossKernel); -REGISTER_OP_CPU_KERNEL( - rank_loss_grad, - ops::RankLossGradKernel); +REGISTER_OP_CPU_KERNEL(rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL(rank_loss_grad, + ops::RankLossGradKernel); REGISTER_OP_CUDA_KERNEL( rank_loss, diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc index fce8c51f003d3..36776cebfcd46 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.cc @@ -17,26 +17,14 @@ REGISTER_REDUCE_OP(reduce_amax); REGISTER_OP_CPU_KERNEL( reduce_amax, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL(reduce_amax_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_amax_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops:: + ReduceGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc index a6c4cb5510529..bb99ca9b17e7e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.cc @@ -17,26 +17,14 @@ REGISTER_REDUCE_OP(reduce_amin); REGISTER_OP_CPU_KERNEL( reduce_amin, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); 
-REGISTER_OP_CPU_KERNEL(reduce_amin_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel, + ops::ReduceKernel); +REGISTER_OP_CPU_KERNEL( + reduce_amin_grad, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops::ReduceGradKernel, + ops:: + ReduceGradKernel); diff --git a/paddle/fluid/operators/repeat_interleave_op.cc b/paddle/fluid/operators/repeat_interleave_op.cc index 2ed43cf8f0ea4..f3bec9489fdb0 100644 --- a/paddle/fluid/operators/repeat_interleave_op.cc +++ b/paddle/fluid/operators/repeat_interleave_op.cc @@ -172,16 +172,14 @@ REGISTER_OPERATOR(repeat_interleave, REGISTER_OPERATOR(repeat_interleave_grad, ops::RepeatInterleaveGradOp, ops::RepeatInterleaveGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - repeat_interleave, - ops::RepeatInterleaveKernel, - ops::RepeatInterleaveKernel, - ops::RepeatInterleaveKernel, - ops::RepeatInterleaveKernel); +REGISTER_OP_CPU_KERNEL(repeat_interleave, + ops::RepeatInterleaveKernel, + ops::RepeatInterleaveKernel, + ops::RepeatInterleaveKernel, + ops::RepeatInterleaveKernel); REGISTER_OP_CPU_KERNEL( repeat_interleave_grad, - ops::RepeatInterleaveGradKernel, - ops::RepeatInterleaveGradKernel, - ops::RepeatInterleaveGradKernel, - ops::RepeatInterleaveGradKernel); + ops::RepeatInterleaveGradKernel, + ops::RepeatInterleaveGradKernel, + ops::RepeatInterleaveGradKernel, + ops::RepeatInterleaveGradKernel); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ec9c1198996c1..b665cce096207 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -420,7 +420,7 @@ class ReshapeKernel { pt_scalar_shape = phi::IntArray(shape_attr); } if (platform::is_cpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeKernel(static_cast(dev_ctx), *in, pt_scalar_shape, @@ -455,7 +455,7 @@ class ReshapeGradKernel { d_x->mutable_data(ctx.GetPlace(), d_out->type()); if (platform::is_cpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } @@ -485,7 +485,7 @@ class ReshapeDoubleGradKernel { dd_out->mutable_data(ctx.GetPlace(), dd_x->type()); if (platform::is_cpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.device_context(); + auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index 1cf72e320ffad..fc39d174c90ae 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -140,8 +140,7 @@ the design document }; template -class RowConvKernel - : public framework::OpKernel { +class RowConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -216,8 +215,7 @@ class RowConvKernel }; template -class RowConvGradKernel - : public framework::OpKernel { +class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -353,8 +351,6 @@ REGISTER_OPERATOR(row_conv, ops::RowConvGradOpMaker, ops::RowConvGradOpMaker); REGISTER_OPERATOR(row_conv_grad, ops::RowConvGradOp); -REGISTER_OP_CPU_KERNEL( - row_conv, ops::RowConvKernel); 
-REGISTER_OP_CPU_KERNEL( - row_conv_grad, - ops::RowConvGradKernel); +REGISTER_OP_CPU_KERNEL(row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL(row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 99ad2328b77cd..fd400d2913670 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -232,9 +232,7 @@ REGISTER_OPERATOR(run_program, REGISTER_OPERATOR(run_program_grad, ops::RunProgramGradOp); /* see [Why use single type kernel] */ -REGISTER_OP_CPU_KERNEL( - run_program, - ops::RunProgramOpKernel) -REGISTER_OP_CPU_KERNEL( - run_program_grad, - ops::RunProgramGradOpKernel) +REGISTER_OP_CPU_KERNEL(run_program, + ops::RunProgramOpKernel) +REGISTER_OP_CPU_KERNEL(run_program_grad, + ops::RunProgramGradOpKernel) diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index d4e862f26cd6d..d6affde0ce022 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -244,8 +244,7 @@ class SampleLogitsKernel : public framework::OpKernel { context.Attr("remove_accidental_hits"); // device contexts - auto& dev_ctx = - context.template device_context(); + auto& dev_ctx = context.template device_context(); // UNDERSTAND: allocate memories for temporaries sampled_logits->mutable_data(samples_dim, context.GetPlace()); @@ -278,8 +277,7 @@ class SampleLogitsKernel : public framework::OpKernel { probabilities->mutable_data(samples_dim, context.GetPlace()); // UNDERSTAND: sampling const auto seed = context.Attr("seed"); - auto sampler_with_prob = - math::SampleWithProb(); + auto sampler_with_prob = math::SampleWithProb(); sampler_with_prob(dev_ctx, math::LogUniformSampler(num_classes, seed), num_samples, @@ -315,9 +313,8 @@ class SampleLogitsGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("SampledLogits")); logits_grad->mutable_data(context.GetPlace()); - auto& dev_ctx = - context.template device_context(); - phi::funcs::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); // UNDERSTAND: scatter it back to logit_grad diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 385b092c4bcc0..6b5c2367bb9ad 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -104,9 +104,8 @@ REGISTER_OPERATOR(save_combine, REGISTER_OP_CPU_KERNEL( save_combine, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel); + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 0ff381bdbab3f..f269c4aa32dea 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -90,14 +90,12 @@ REGISTER_OPERATOR(save, REGISTER_OP_CPU_KERNEL( save, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel); + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + 
ops::SaveOpKernel); diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index 93f2d60e5f232..1249e3e807ec7 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -42,7 +42,7 @@ TEST(scatter, ScatterUpdate) { } auto* cpu_place = new paddle::platform::CPUPlace(); - paddle::platform::CPUDeviceContext ctx(*cpu_place); + phi::CPUContext ctx(*cpu_place); phi::funcs::ScatterAssign(ctx, src, index, &output); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index 32aa7442f5199..07cd48604b8aa 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -63,7 +63,7 @@ void call_gemm(const framework::ExecutionContext& ctx, T* C) { int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); } diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 1364d4c1d2ae9..527884ec9c9b6 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -72,8 +72,7 @@ REGISTER_OPERATOR( ops::SeedOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - seed, ops::CPUSeedKernel); +REGISTER_OP_CPU_KERNEL(seed, ops::CPUSeedKernel); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(seed).AddCheckpoint( diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 449bd694ceb46..9b1d7a27e58e4 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -32,8 +32,8 @@ class GPUSeedKernel : public framework::OpKernel { platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(platform::CPUPlace()); out->mutable_data(platform::CPUPlace()); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), + phi::funcs::SetConstant functor; + functor(reinterpret_cast(dev_ctx), out, static_cast(seed)); } else { diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc index 9b7bd3fd6c6ab..117fc4ebe0c36 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc @@ -141,19 +141,17 @@ REGISTER_OPERATOR(sequence_concat, op::SeqConcatOpMaker, op::SeqConcatGradOpMaker, op::SeqConcatGradOpMaker); -REGISTER_OP_CPU_KERNEL( - sequence_concat, - op::SeqConcatKernel, - op::SeqConcatKernel, - op::SeqConcatKernel, - op::SeqConcatKernel); +REGISTER_OP_CPU_KERNEL(sequence_concat, + op::SeqConcatKernel, + op::SeqConcatKernel, + op::SeqConcatKernel, + op::SeqConcatKernel); REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp, op::SeqConcatGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_concat_grad, - op::SeqConcatGradKernel, - op::SeqConcatGradKernel, - op::SeqConcatGradKernel, - op::SeqConcatGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_concat_grad, + op::SeqConcatGradKernel, + op::SeqConcatGradKernel, + op::SeqConcatGradKernel, + op::SeqConcatGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc index dced2038eb680..f1350ce334b41 100644 --- 
a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc @@ -268,11 +268,9 @@ REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp, ops::SequenceConvGradNoNeedBufferVarsInference); -REGISTER_OP_CPU_KERNEL( - sequence_conv, - ops::SequenceConvKernel, - ops::SequenceConvKernel); -REGISTER_OP_CPU_KERNEL( - sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CPU_KERNEL(sequence_conv_grad, + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index a06ed8b02d110..de55f1ab52a35 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -88,7 +88,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sequence_enumerate, ops::SequenceEnumerateOp, ops::SequenceEnumerateOpMaker); -REGISTER_OP_CPU_KERNEL( - sequence_enumerate, - ops::SequenceEnumerateKernel, - ops::SequenceEnumerateKernel); +REGISTER_OP_CPU_KERNEL(sequence_enumerate, + ops::SequenceEnumerateKernel, + ops::SequenceEnumerateKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc index e763635b7f419..c64b568e533d0 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cc @@ -97,7 +97,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp, ops::SequenceEraseOpMaker); -REGISTER_OP_CPU_KERNEL( - sequence_erase, - ops::SequenceEraseKernel, - ops::SequenceEraseKernel); +REGISTER_OP_CPU_KERNEL(sequence_erase, + ops::SequenceEraseKernel, + ops::SequenceEraseKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc index 4135f046c21e2..5c3731fc90253 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc @@ -208,16 +208,14 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad, ops::SequenceExpandAsGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_expand_as, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel, - ops::SequenceExpandAsKernel); +REGISTER_OP_CPU_KERNEL(sequence_expand_as, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel, + ops::SequenceExpandAsKernel); REGISTER_OP_CPU_KERNEL( sequence_expand_as_grad, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel, - ops::SequenceExpandAsGradKernel); + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel, + ops::SequenceExpandAsGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h index da9ad3574db2f..02d2b87874d05 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h @@ -43,9 +43,9 @@ struct SequenceExpandAsGradFunctor { }; template -struct 
SequenceExpandAsFunctor { +struct SequenceExpandAsFunctor { void operator()( - const platform::CPUDeviceContext &context, + const phi::CPUContext &context, const framework::LoDTensor &x, const framework::Vector &ref_lod, /*expand referenced lod*/ framework::LoDTensor *out) { @@ -121,9 +121,9 @@ class SequenceExpandAsKernel : public framework::OpKernel { * * */ template -struct SequenceExpandAsGradFunctor { +struct SequenceExpandAsGradFunctor { void operator()( - const platform::CPUDeviceContext &context, + const phi::CPUContext &context, const framework::LoDTensor &dout, const framework::Vector &ref_lod, /*expand referenced lod*/ framework::LoDTensor *dx) { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index e9e7912fe5036..a2fb088975e39 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -281,15 +281,13 @@ REGISTER_OPERATOR(sequence_expand, REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad, ops::SequenceExpandGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_expand, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel, - ops::SequenceExpandKernel); -REGISTER_OP_CPU_KERNEL( - sequence_expand_grad, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel, - ops::SequenceExpandGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_expand, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel, + ops::SequenceExpandKernel); +REGISTER_OP_CPU_KERNEL(sequence_expand_grad, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel, + ops::SequenceExpandGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h index b6cd8c3b9079a..158aa0e4fe190 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h @@ -49,9 +49,9 @@ struct SequenceExpandGradFunctor { }; template -struct SequenceExpandFunctor { +struct SequenceExpandFunctor { void operator()( - const platform::CPUDeviceContext& context, + const phi::CPUContext& context, const LoDTensor& x, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ @@ -161,9 +161,9 @@ class SequenceExpandKernel : public framework::OpKernel { * * */ template -struct SequenceExpandGradFunctor { +struct SequenceExpandGradFunctor { void operator()( - const platform::CPUDeviceContext& context, + const phi::CPUContext& context, const LoDTensor& dout, const framework::Vector& x_lod, /*expand source lod*/ const framework::Vector& ref_lod, /*expand referenced lod*/ @@ -181,7 +181,7 @@ struct SequenceExpandGradFunctor { int dout_end = dout_offset + repeat_num * x_seq_len; auto dout_sub = dout.Slice(dout_offset, dout_end); dout_sub.Resize({repeat_num, dx_sub.dims()[0]}); - phi::funcs::ColwiseSum col_sum; + phi::funcs::ColwiseSum col_sum; col_sum(context, dout_sub, &dx_sub); dout_offset += repeat_num * x_seq_len; } diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index 6b20338f95eb7..2ed9c44f5928c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -103,11 +103,7 @@ 
REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL( sequence_mask, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel, - paddle::operators::SequenceMaskKernel); + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel, + paddle::operators::SequenceMaskKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index dc04a6cce7abd..ad4876970c532 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -282,15 +282,13 @@ REGISTER_OPERATOR(sequence_pad, REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp, ops::SequencePadGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_pad, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel, - ops::SequencePadOpKernel); -REGISTER_OP_CPU_KERNEL( - sequence_pad_grad, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel, - ops::SequencePadGradOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_pad, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel, + ops::SequencePadOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_pad_grad, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel, + ops::SequencePadGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 327fdfda5e28f..6c146a699af8b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -193,12 +193,10 @@ REGISTER_OPERATOR(sequence_pool, REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp, ops::SequencePoolGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_pool, - ops::SequencePoolKernel, - ops::SequencePoolKernel); - -REGISTER_OP_CPU_KERNEL( - sequence_pool_grad, - ops::SequencePoolGradKernel, - ops::SequencePoolGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_pool, + ops::SequencePoolKernel, + ops::SequencePoolKernel); + +REGISTER_OP_CPU_KERNEL(sequence_pool_grad, + ops::SequencePoolGradKernel, + ops::SequencePoolGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc index 5266650f2279d..6925267f1a981 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc @@ -144,15 +144,13 @@ REGISTER_OPERATOR(sequence_reshape, ops::SequenceReshapeGradOpMaker, ops::SequenceReshapeGradOpMaker); REGISTER_OPERATOR(sequence_reshape_grad, ops::SequenceReshapeGradOp); -REGISTER_OP_CPU_KERNEL( - sequence_reshape, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel, - ops::SequenceReshapeKernel); -REGISTER_OP_CPU_KERNEL( - sequence_reshape_grad, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel, - ops::SequenceReshapeGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_reshape, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel, + ops::SequenceReshapeKernel); +REGISTER_OP_CPU_KERNEL(sequence_reshape_grad, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + ops::SequenceReshapeGradKernel, + 
ops::SequenceReshapeGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc index f17c2baca9896..d1e8409653a5e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc @@ -22,10 +22,9 @@ REGISTER_OPERATOR(sequence_reverse, ops::SequenceReverseGradOpMaker, ops::SequenceReverseGradOpMaker); -REGISTER_OP_CPU_KERNEL( - sequence_reverse, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel, - ops::SequenceReverseOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_reverse, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index a7578e25f93f4..9375cea85c78f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -159,16 +159,14 @@ REGISTER_OPERATOR(sequence_slice, REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp, ops::SequenceSliceGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_slice, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel, - ops::SequenceSliceOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_slice, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel, + ops::SequenceSliceOpKernel); REGISTER_OP_CPU_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel, + ops::SequenceSliceGradOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 863d2e01d73e5..bb0ad26b51bb4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -185,11 +185,9 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(sequence_softmax_grad, ops::SequenceSoftmaxGradOp, ops::SequenceSoftmaxGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_softmax, - ops::SequenceSoftmaxKernel, - ops::SequenceSoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel, - ops::SequenceSoftmaxGradKernel); +REGISTER_OP_CPU_KERNEL(sequence_softmax, + ops::SequenceSoftmaxKernel, + ops::SequenceSoftmaxKernel); +REGISTER_OP_CPU_KERNEL(sequence_softmax_grad, + ops::SequenceSoftmaxGradKernel, + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h index 3ebf955fe259a..0d3d3b695af4b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h @@ -41,8 +41,8 @@ struct SequenceSoftmaxGradFunctor { }; template -struct SequenceSoftmaxFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct SequenceSoftmaxFunctor { + void operator()(const phi::CPUContext &ctx, const LoDTensor &x, const framework::Vector &ref_lod, /*referenced lod*/ LoDTensor *out) { @@ -63,8 +63,8 @@ struct 
SequenceSoftmaxFunctor { }; template -struct SequenceSoftmaxGradFunctor { - void operator()(const platform::CPUDeviceContext &ctx, +struct SequenceSoftmaxGradFunctor { + void operator()(const phi::CPUContext &ctx, const LoDTensor &dout, const LoDTensor &out, const framework::Vector &ref_lod, /*referenced lod*/ diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc index 0c312cfb1cf83..b19dfe40ed95e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc @@ -139,9 +139,7 @@ REGISTER_OPERATOR(sequence_topk_avg_pooling_grad, ops::SequenceTopkAvgPoolingGradOp); REGISTER_OP_CPU_KERNEL( sequence_topk_avg_pooling, - ops::SequenceTopkAvgPoolingKernel); + ops::SequenceTopkAvgPoolingKernel); REGISTER_OP_CPU_KERNEL( sequence_topk_avg_pooling_grad, - ops::SequenceTopkAvgPoolingGradKernel); + ops::SequenceTopkAvgPoolingGradKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h index 04115c69a9a7d..1c1168e449eb7 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -202,9 +202,8 @@ class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel { auto pos_data = pos_input->data(); auto dout_data = d_out->data(); - auto& dev_ctx = - context.template device_context(); - phi::funcs::SetConstant zero; + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant zero; zero(dev_ctx, d_in, static_cast(0.0)); auto din_data = d_in->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 4b90d64d26fe3..613dc8bfbc9b1 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -194,16 +194,14 @@ REGISTER_OPERATOR(sequence_unpad, REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp, ops::SequenceUnpadGradOpNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - sequence_unpad, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel, - ops::SequenceUnpadOpKernel); +REGISTER_OP_CPU_KERNEL(sequence_unpad, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel, + ops::SequenceUnpadOpKernel); REGISTER_OP_CPU_KERNEL( sequence_unpad_grad, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel, - ops::SequenceUnpadGradOpKernel); + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel, + ops::SequenceUnpadGradOpKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 74896d8499672..4a3668b114059 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -137,13 +137,11 @@ REGISTER_OPERATOR(shuffle_channel, REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); -REGISTER_OP_CPU_KERNEL( - shuffle_channel, - ops::ShuffleChannelOpKernel, - ops::ShuffleChannelOpKernel); +REGISTER_OP_CPU_KERNEL(shuffle_channel, + ops::ShuffleChannelOpKernel, + ops::ShuffleChannelOpKernel); REGISTER_OP_CPU_KERNEL( shuffle_channel_grad, - ops::ShuffleChannelGradOpKernel, - 
ops::ShuffleChannelGradOpKernel); + ops::ShuffleChannelGradOpKernel, + ops::ShuffleChannelGradOpKernel); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 4b6bcae7635b8..4e81226188304 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -466,31 +466,25 @@ REGISTER_OPERATOR(slice_grad, REGISTER_OP_CPU_KERNEL( slice, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel>, - ops::SliceKernel>, - ops::SliceKernel); + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel>, + ops::SliceKernel>, + ops::SliceKernel); REGISTER_OP_CPU_KERNEL( slice_grad, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel>, - ops::SliceGradKernel>, - ops::SliceGradKernel); + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel>, + ops::SliceGradKernel>, + ops::SliceGradKernel); REGISTER_OP_CUDA_KERNEL( slice, diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc index eb391fd3fb73c..f8bebe331d8be 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.cc +++ b/paddle/fluid/operators/smooth_l1_loss_op.cc @@ -225,9 +225,7 @@ REGISTER_OPERATOR(smooth_l1_loss, ops::SmoothL1LossGradMaker, ops::SmoothL1LossGradMaker); REGISTER_OPERATOR(smooth_l1_loss_grad, ops::SmoothL1LossGradOp); -REGISTER_OP_CPU_KERNEL( - smooth_l1_loss, - ops::SmoothL1LossKernel); -REGISTER_OP_CPU_KERNEL( - smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); +REGISTER_OP_CPU_KERNEL(smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CPU_KERNEL(smooth_l1_loss_grad, + ops::SmoothL1LossGradKernel); diff --git a/paddle/fluid/operators/solve_op.cc b/paddle/fluid/operators/solve_op.cc index 77a45684aca0e..a7bf413e10519 100644 --- a/paddle/fluid/operators/solve_op.cc +++ b/paddle/fluid/operators/solve_op.cc @@ -221,11 +221,9 @@ REGISTER_OPERATOR(solve, REGISTER_OPERATOR(solve_grad, ops::SolveGradOp); -REGISTER_OP_CPU_KERNEL( - solve, - ops::SolveKernel, - ops::SolveKernel); -REGISTER_OP_CPU_KERNEL( - solve_grad, - ops::SolveGradKernel, - ops::SolveGradKernel); +REGISTER_OP_CPU_KERNEL(solve, + ops::SolveKernel, + ops::SolveKernel); +REGISTER_OP_CPU_KERNEL(solve_grad, + ops::SolveGradKernel, + ops::SolveGradKernel); diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index fb428594226a9..dce7539fe72b8 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -226,15 +226,13 @@ REGISTER_OPERATOR(space_to_depth, REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp, ops::SpaceToDepthGradOpNoBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - space_to_depth, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel, - ops::SpaceToDepthKernel); -REGISTER_OP_CPU_KERNEL( - space_to_depth_grad, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel, - ops::SpaceToDepthGradKernel); +REGISTER_OP_CPU_KERNEL(space_to_depth, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel); +REGISTER_OP_CPU_KERNEL(space_to_depth_grad, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + 
ops::SpaceToDepthGradKernel); diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h index d66cc503307e6..f69573e18927e 100644 --- a/paddle/fluid/operators/spectral_helper.h +++ b/paddle/fluid/operators/spectral_helper.h @@ -210,7 +210,7 @@ void exec_fft(const DeviceContext& ctx, transposed_input.Resize(transposed_input_shape); const auto place = ctx.GetPlace(); transposed_input.mutable_data(place); - TransCompute( + TransCompute( ndim, ctx, *x, &transposed_input, dim_permute); // make an collapsed input: collapse batch axes for input @@ -310,39 +310,39 @@ void exec_fft(const DeviceContext& ctx, for (int i = 0; i < ndim; i++) { reverse_dim_permute[dim_permute[i]] = i; } - TransCompute( + TransCompute( ndim, ctx, transposed_output, out, reverse_dim_permute); } template -struct FFTC2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, FFTNormMode normalization, bool forward) { - exec_fft( + exec_fft( ctx, x, out, axes, normalization, forward); } }; template -struct FFTR2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, FFTNormMode normalization, bool forward) { - exec_fft( + exec_fft( ctx, x, out, axes, normalization, forward); } }; template -struct FFTC2RFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, @@ -353,14 +353,14 @@ struct FFTC2RFunctor { Tensor temp; temp.mutable_data(x->dims(), ctx.GetPlace()); - FFTC2CFunctor c2c_functor; + FFTC2CFunctor c2c_functor; c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward); const std::vector new_axes{axes.back()}; - exec_fft( + exec_fft( ctx, &temp, out, new_axes, normalization, forward); } else { - exec_fft( + exec_fft( ctx, x, out, axes, normalization, forward); } } @@ -383,8 +383,8 @@ T compute_factor(int64_t size, FFTNormMode normalization) { } template -struct FFTC2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTC2CFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, @@ -426,8 +426,8 @@ struct FFTC2CFunctor { }; template -struct FFTR2CFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTR2CFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, @@ -483,8 +483,8 @@ struct FFTR2CFunctor { }; template -struct FFTC2RFunctor { - void operator()(const platform::CPUDeviceContext& ctx, +struct FFTC2RFunctor { + void operator()(const phi::CPUContext& ctx, const Tensor* x, Tensor* out, const std::vector& axes, diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 1af812c336b5a..a6addb2e6f46d 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -256,11 +256,9 @@ REGISTER_OPERATOR(spectral_norm, ops::SpectralNormGradOpMaker, ops::SpectralNormGradOpMaker); REGISTER_OPERATOR(spectral_norm_grad, ops::SpectralNormOpGrad); -REGISTER_OP_CPU_KERNEL( - spectral_norm, - ops::SpectralNormKernel, - ops::SpectralNormKernel); -REGISTER_OP_CPU_KERNEL( - spectral_norm_grad, - ops::SpectralNormGradKernel, - 
ops::SpectralNormGradKernel); +REGISTER_OP_CPU_KERNEL(spectral_norm, + ops::SpectralNormKernel, + ops::SpectralNormKernel); +REGISTER_OP_CPU_KERNEL(spectral_norm_grad, + ops::SpectralNormGradKernel, + ops::SpectralNormGradKernel); diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index 3f00333b98089..91e3880dff004 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -351,45 +351,39 @@ REGISTER_OPERATOR(fft_c2c, ops::FFTC2COpMaker, ops::FFTC2CGradOpMaker, ops::FFTC2CGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fft_c2c, - ops::FFTC2CKernel, - ops::FFTC2CKernel); +REGISTER_OP_CPU_KERNEL(fft_c2c, + ops::FFTC2CKernel, + ops::FFTC2CKernel); REGISTER_OPERATOR(fft_c2c_grad, ops::FFTC2CGradOp); -REGISTER_OP_CPU_KERNEL( - fft_c2c_grad, - ops::FFTC2CGradKernel, - ops::FFTC2CGradKernel); +REGISTER_OP_CPU_KERNEL(fft_c2c_grad, + ops::FFTC2CGradKernel, + ops::FFTC2CGradKernel); REGISTER_OPERATOR(fft_r2c, ops::FFTR2COp, ops::FFTR2COpMaker, ops::FFTR2CGradOpMaker, ops::FFTR2CGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fft_r2c, - ops::FFTR2CKernel, - ops::FFTR2CKernel); +REGISTER_OP_CPU_KERNEL(fft_r2c, + ops::FFTR2CKernel, + ops::FFTR2CKernel); REGISTER_OPERATOR(fft_r2c_grad, ops::FFTR2CGradOp); -REGISTER_OP_CPU_KERNEL( - fft_r2c_grad, - ops::FFTR2CGradKernel, - ops::FFTR2CGradKernel); +REGISTER_OP_CPU_KERNEL(fft_r2c_grad, + ops::FFTR2CGradKernel, + ops::FFTR2CGradKernel); REGISTER_OPERATOR(fft_c2r, ops::FFTC2ROp, ops::FFTC2ROpMaker, ops::FFTC2RGradOpMaker, ops::FFTC2RGradOpMaker); -REGISTER_OP_CPU_KERNEL( - fft_c2r, - ops::FFTC2RKernel, - ops::FFTC2RKernel); +REGISTER_OP_CPU_KERNEL(fft_c2r, + ops::FFTC2RKernel, + ops::FFTC2RKernel); REGISTER_OPERATOR(fft_c2r_grad, ops::FFTC2RGradOp); -REGISTER_OP_CPU_KERNEL( - fft_c2r_grad, - ops::FFTC2RGradKernel, - ops::FFTC2RGradKernel); +REGISTER_OP_CPU_KERNEL(fft_c2r_grad, + ops::FFTC2RGradKernel, + ops::FFTC2RGradKernel); diff --git a/paddle/fluid/operators/spp_op.cc b/paddle/fluid/operators/spp_op.cc index 583b0b69a63cf..b1ca67f521816 100644 --- a/paddle/fluid/operators/spp_op.cc +++ b/paddle/fluid/operators/spp_op.cc @@ -109,11 +109,9 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(spp_grad, ops::SppOpGrad); -REGISTER_OP_CPU_KERNEL( - spp, - ops::SppKernel, - ops::SppKernel); -REGISTER_OP_CPU_KERNEL( - spp_grad, - ops::SppGradKernel, - ops::SppGradKernel); +REGISTER_OP_CPU_KERNEL(spp, + ops::SppKernel, + ops::SppKernel); +REGISTER_OP_CPU_KERNEL(spp_grad, + ops::SppGradKernel, + ops::SppGradKernel); diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc index 2b76a42706535..55d307cf087ec 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.cc +++ b/paddle/fluid/operators/squared_l2_distance_op.cc @@ -221,10 +221,8 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp, ops::SquaredL2DistanceGradOpNoBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - squared_l2_distance, - ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance, + ops::SquaredL2DistanceKernel); REGISTER_OP_CPU_KERNEL( squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceGradKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc index 529c4262b0a08..f6792baa1f591 100644 --- a/paddle/fluid/operators/squared_l2_norm_op.cc +++ 
b/paddle/fluid/operators/squared_l2_norm_op.cc @@ -96,11 +96,9 @@ REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormGradOpMaker, ops::SquaredL2NormGradOpMaker); REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp); -REGISTER_OP_CPU_KERNEL( - squared_l2_norm, - ops::SquaredL2NormKernel, - ops::SquaredL2NormKernel); -REGISTER_OP_CPU_KERNEL( - squared_l2_norm_grad, - ops::SquaredL2NormGradKernel, - ops::SquaredL2NormGradKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_norm, + ops::SquaredL2NormKernel, + ops::SquaredL2NormKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_norm_grad, + ops::SquaredL2NormGradKernel, + ops::SquaredL2NormGradKernel); diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 29c54d2699aff..f532a429b49e2 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -375,31 +375,25 @@ REGISTER_OPERATOR(squeeze2_grad, REGISTER_OP_CPU_KERNEL( squeeze, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel, - ops::SqueezeKernel>, - ops::SqueezeKernel>, - ops::SqueezeKernel); + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel>, + ops::SqueezeKernel>, + ops::SqueezeKernel); REGISTER_OP_CPU_KERNEL( squeeze_grad, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel>, - ops::SqueezeGradKernel); + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel>, + ops::SqueezeGradKernel>, + ops::SqueezeGradKernel); diff --git a/paddle/fluid/operators/stft_op.cc b/paddle/fluid/operators/stft_op.cc index fd064c255f560..d708abe3d0f8c 100644 --- a/paddle/fluid/operators/stft_op.cc +++ b/paddle/fluid/operators/stft_op.cc @@ -164,12 +164,10 @@ REGISTER_OPERATOR(stft, REGISTER_OPERATOR(stft_grad, ops::StftGradOp); -REGISTER_OP_CPU_KERNEL( - stft, - ops::StftKernel, - ops::StftKernel); - -REGISTER_OP_CPU_KERNEL( - stft_grad, - ops::StftGradKernel, - ops::StftGradKernel); +REGISTER_OP_CPU_KERNEL(stft, + ops::StftKernel, + ops::StftKernel); + +REGISTER_OP_CPU_KERNEL(stft_grad, + ops::StftGradKernel, + ops::StftGradKernel); diff --git a/paddle/fluid/operators/strided_memcpy_test.cc b/paddle/fluid/operators/strided_memcpy_test.cc index 362be0d5da33d..e16df34542795 100644 --- a/paddle/fluid/operators/strided_memcpy_test.cc +++ b/paddle/fluid/operators/strided_memcpy_test.cc @@ -35,7 +35,7 @@ TEST(StridedMemcpy, CPUCrop) { framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({2, 1}); - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; StridedMemcpy(ctx, src + 1, src_stride, dst_dim, dst_stride, dst); ASSERT_EQ(1, dst[0]); @@ -57,7 +57,7 @@ TEST(StridedMemcpy, CPUConcat) { framework::DDim src_stride({2, 1}); framework::DDim dst_dim({2, 2}); framework::DDim dst_stride({4, 1}); - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst); StridedMemcpy(ctx, src, src_stride, dst_dim, dst_stride, dst + 2); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 
fca510143d0de..ca851b8ee75b1 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -359,9 +359,8 @@ REGISTER_OPERATOR(sum, REGISTER_OP_CPU_KERNEL( sum, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/svd_op.cc b/paddle/fluid/operators/svd_op.cc index 4186068cd6e40..7ae85343e0472 100644 --- a/paddle/fluid/operators/svd_op.cc +++ b/paddle/fluid/operators/svd_op.cc @@ -172,7 +172,6 @@ REGISTER_OP_CPU_KERNEL(svd, ops::SvdCPUKernel, ops::SvdCPUKernel); -REGISTER_OP_CPU_KERNEL( - svd_grad, - ops::SvdGradKernel, - ops::SvdGradKernel); +REGISTER_OP_CPU_KERNEL(svd_grad, + ops::SvdGradKernel, + ops::SvdGradKernel); diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index 7b98dc21d07bb..b7d3b7d3e5ae0 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -40,10 +40,10 @@ class SvdCPUKernel : public framework::OpKernel { /*Create Tensors and output, set the dim ...*/ auto numel = x->numel(); - auto& orig_dev_ctx = - context.template device_context(); - auto& dev_ctx = static_cast::TYPE&>(orig_dev_ctx); + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); Tensor trans_x = ::phi::TransposeLast2Dim(dev_ctx, *x); auto* x_data = trans_x.data(); auto x_dims = x->dims(); diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h index cd0842e4a47bf..6df883e83337f 100644 --- a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h @@ -103,10 +103,9 @@ static bool TestLeakyReluGradGradMain(const framework::DDim &dim, for_range(actual_functor); } else { #endif - auto &cpu_dev_ctx = dynamic_cast(dev_ctx); + auto &cpu_dev_ctx = dynamic_cast(dev_ctx); functor(cpu_dev_ctx, &x, out, &ddx, &ddout, dout, dx); - platform::ForRange for_range(cpu_dev_ctx, - limit); + platform::ForRange for_range(cpu_dev_ctx, limit); for_range(actual_functor); #if defined(__NVCC__) || defined(__HIPCC__) } diff --git a/paddle/fluid/operators/tree_conv_op.cc b/paddle/fluid/operators/tree_conv_op.cc index f62e1d3072fa3..525dd17c39bb9 100644 --- a/paddle/fluid/operators/tree_conv_op.cc +++ b/paddle/fluid/operators/tree_conv_op.cc @@ -234,12 +234,10 @@ REGISTER_OPERATOR(tree_conv, REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp); -REGISTER_OP_CPU_KERNEL( - tree_conv, - ops::TreeConvKernel, - ops::TreeConvKernel); - -REGISTER_OP_CPU_KERNEL( - tree_conv_grad, - ops::TreeConvGradKernel, - ops::TreeConvGradKernel); +REGISTER_OP_CPU_KERNEL(tree_conv, + ops::TreeConvKernel, + ops::TreeConvKernel); + +REGISTER_OP_CPU_KERNEL(tree_conv_grad, + ops::TreeConvGradKernel, + ops::TreeConvGradKernel); diff --git a/paddle/fluid/operators/unique_consecutive_op.cc b/paddle/fluid/operators/unique_consecutive_op.cc index e9c6a4edb66c5..73f6918d52598 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cc +++ b/paddle/fluid/operators/unique_consecutive_op.cc @@ -118,12 +118,11 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(unique_consecutive, ops::UniqueConsecutiveOp, ops::UniqueConsecutiveOpMaker); -REGISTER_OP_CPU_KERNEL( - unique_consecutive, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel, - 
ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel); +REGISTER_OP_CPU_KERNEL(unique_consecutive, + ops::UniqueConsecutiveKernel, + ops::UniqueConsecutiveKernel, + ops::UniqueConsecutiveKernel, + ops::UniqueConsecutiveKernel); REGISTER_OP_VERSION(unique_consecutive) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc index 82e6b734aa009..47679ca57f5bf 100644 --- a/paddle/fluid/operators/unpool_op.cc +++ b/paddle/fluid/operators/unpool_op.cc @@ -328,14 +328,12 @@ REGISTER_OPERATOR(unpool, ops::UnpoolOpGradMaker); REGISTER_OPERATOR(unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL( - unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_CPU_KERNEL( - unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CPU_KERNEL(unpool, + ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL(unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); REGISTER_OPERATOR(unpool3d, ops::Unpool3dOp, @@ -344,11 +342,9 @@ REGISTER_OPERATOR(unpool3d, ops::Unpool3dOpGradMaker); REGISTER_OPERATOR(unpool3d_grad, ops::Unpool3dOpGrad); -REGISTER_OP_CPU_KERNEL( - unpool3d, - ops::Unpool3dKernel, - ops::Unpool3dKernel); -REGISTER_OP_CPU_KERNEL( - unpool3d_grad, - ops::Unpool3dGradKernel, - ops::Unpool3dGradKernel); +REGISTER_OP_CPU_KERNEL(unpool3d, + ops::Unpool3dKernel, + ops::Unpool3dKernel); +REGISTER_OP_CPU_KERNEL(unpool3d_grad, + ops::Unpool3dGradKernel, + ops::Unpool3dGradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 18a1d5435e014..53de6440f1f61 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -378,33 +378,28 @@ REGISTER_OPERATOR(unsqueeze2_grad, REGISTER_OP_CPU_KERNEL( unsqueeze, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel>, - ops::UnsqueezeKernel); + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel>, + ops::UnsqueezeKernel); REGISTER_OP_CPU_KERNEL( unsqueeze_grad, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel, - ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel>, + ops::UnsqueezeGradKernel>, - ops::UnsqueezeGradKernel); + ops::UnsqueezeGradKernel); diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index d46d2bd847341..53feefef3e1cc 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -321,7 +321,7 @@ class CPUVarConv2dOPKernel : public framework::OpKernel { auto* w_data = w->data(); auto* col_data = col->data(); - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); for (int b = 0; b < batch; ++b) { int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; if (top_im_size 
== 0) { @@ -479,7 +479,7 @@ class CPUVarConv2dOPGradKernel : public framework::OpKernel { int batch = x->lod()[0].size() - 1; const auto& top_offset = out->lod()[0]; const auto& col_offset = col->lod()[0]; - auto blas = phi::funcs::GetBlas(ctx); + auto blas = phi::funcs::GetBlas(ctx); for (int b = 0; b < batch; ++b) { int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; if (top_im_size == 0) { @@ -526,11 +526,10 @@ REGISTER_OPERATOR(var_conv_2d, REGISTER_OPERATOR(var_conv_2d_grad, ops::VarConv2dOpGrad); REGISTER_OP_CPU_KERNEL(var_conv_2d, - ops::CPUVarConv2dOPKernel); -// ops::CPUVarConv2dOPKernel); +// ops::CPUVarConv2dOPKernel -REGISTER_OP_CPU_KERNEL( - var_conv_2d_grad, - ops::CPUVarConv2dOPGradKernel); -// ops::CPUVarConv2dOPGradKernel); +// ops::CPUVarConv2dOPGradKernel diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index e6a847758bdee..99828a425517b 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -233,7 +233,7 @@ NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor, aclMemType mem_type) { NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = - static_cast(pool.Get(platform::CPUPlace())); + static_cast(pool.Get(platform::CPUPlace())); Tensor host_tensor; paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); @@ -249,7 +249,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = - static_cast(pool.Get(platform::CPUPlace())); + static_cast(pool.Get(platform::CPUPlace())); Tensor host_tensor; paddle::framework::TensorFromVector(dims, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); @@ -265,7 +265,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = - static_cast(pool.Get(platform::CPUPlace())); + static_cast(pool.Get(platform::CPUPlace())); Tensor host_tensor; paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); @@ -281,7 +281,7 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = - static_cast(pool.Get(platform::CPUPlace())); + static_cast(pool.Get(platform::CPUPlace())); Tensor host_tensor; paddle::framework::TensorFromVector(values, *dev_ctx, &host_tensor); host_tensors_.emplace_back(host_tensor); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index ec7f46cd973d4..4dfeca3bd1325 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -261,7 +261,7 @@ void EmplaceDeviceContexts( p, disable_setting_default_stream_for_allocator); #else - EmplaceDeviceContext( + EmplaceDeviceContext( place_to_device_context, p, disable_setting_default_stream_for_allocator); @@ -751,7 +751,7 @@ const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_MKLDNN MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) - : 
CPUDeviceContext(place), p_blobmap_() { + : phi::CPUContext(place), p_blobmap_() { p_blobmap_.reset(new BlobMap()); p_exec_items_.reset(new ExecShape()); p_mutex_.reset(new std::mutex()); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2c3bc017635dd..1b7aafdac6f29 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -133,7 +133,6 @@ constexpr DeviceType kIPU = DeviceType::IPU; constexpr DeviceType kMLU = DeviceType::MLU; using DeviceContext = phi::DeviceContext; - using CPUDeviceContext = phi::CPUContext; template @@ -141,7 +140,7 @@ struct DefaultDeviceContextType; template <> struct DefaultDeviceContextType { - using TYPE = CPUDeviceContext; + using TYPE = phi::CPUContext; }; // Graphcore IPU @@ -776,7 +775,7 @@ class MKLDNNDeviceContextThreadLocals { } }; -class MKLDNNDeviceContext : public CPUDeviceContext { +class MKLDNNDeviceContext : public phi::CPUContext { public: template using BlobPtr_t = std::shared_ptr; diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 1caa2e8770772..5e0717ba635ce 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -39,17 +39,17 @@ class Multiply { using paddle::memory::Alloc; using paddle::memory::Copy; -using paddle::platform::CPUDeviceContext; using paddle::platform::CPUPlace; using paddle::platform::CUDADeviceContext; using paddle::platform::CUDAPlace; +using phi::CPUContext; using paddle::platform::Transform; TEST(Transform, CPUUnary) { - CPUDeviceContext ctx; + CPUContext ctx; float buf[4] = {0.1, 0.2, 0.3, 0.4}; - Transform trans; + Transform trans; trans(ctx, buf, buf + 4, buf, Scale(10)); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); @@ -78,8 +78,8 @@ TEST(Transform, GPUUnary) { TEST(Transform, CPUBinary) { int buf[4] = {1, 2, 3, 4}; - Transform trans; - CPUDeviceContext ctx; + Transform trans; + phi::CPUContext ctx; trans(ctx, buf, buf + 4, buf, buf, Multiply()); for (int i = 0; i < 4; ++i) { ASSERT_EQ((i + 1) * (i + 1), buf[i]); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7b7e9d1a6c9ed..3723e58e52902 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -2089,7 +2089,7 @@ All parameter, weight, gradient are variables in Paddle. 
.def_static("create", [](paddle::platform::CPUPlace& place) -> paddle::platform::DeviceContext* { - auto* context = new paddle::platform::CPUDeviceContext(); + auto* context = new phi::CPUContext(); context->SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index bba8526abd7f9..ccec0c060a3a4 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -676,7 +676,7 @@ void SetUVATensorFromPyArray( template void _sliceCompute(const framework::Tensor *in, framework::Tensor *out, - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, const std::vector &axes, const std::vector &starts) { auto &eigen_place = *ctx.eigen_device(); @@ -711,7 +711,7 @@ void _sliceCompute(const framework::Tensor *in, template void _concatCompute(const std::vector &ins, paddle::framework::Tensor *out, - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, int64_t axis) { if (axis == 0 && ins.size() < 10) { size_t output_offset = 0; @@ -729,8 +729,7 @@ void _concatCompute(const std::vector &ins, output_offset += in_stride[axis]; } } else { - paddle::operators::math::ConcatFunctor - concat_functor; + paddle::operators::math::ConcatFunctor concat_functor; concat_functor(ctx, ins, static_cast(axis), out); } } @@ -817,7 +816,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, template void _sliceDapper(const framework::Tensor *in, framework::Tensor *out, - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, const std::vector &axes, const std::vector &starts, int size) { @@ -858,7 +857,7 @@ void _sliceDapper(const framework::Tensor *in, template inline framework::Tensor *_sliceWrapper(const framework::Tensor &self, - const platform::CPUDeviceContext &ctx, + const phi::CPUContext &ctx, py::object obj, int dim, int64_t start, @@ -876,7 +875,7 @@ template inline framework::Tensor *_sliceAndConcat(const framework::Tensor &self, py::object obj, int dim) { - platform::CPUDeviceContext ctx; + phi::CPUContext ctx; int64_t start, stop, step, slicelength; _getSliceinfo(self, obj, dim, &start, &stop, &step, &slicelength); if (step == 1 || slicelength == 1) { diff --git a/paddle/phi/kernels/funcs/gru_compute.cc b/paddle/phi/kernels/funcs/gru_compute.cc index c081a9ed97d1f..f0c946134906b 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cc +++ b/paddle/phi/kernels/funcs/gru_compute.cc @@ -19,8 +19,8 @@ namespace phi { namespace funcs { template -struct GRUUnitFunctor { - static void compute(const paddle::platform::CPUDeviceContext &context, +struct GRUUnitFunctor { + static void compute(const phi::CPUContext &context, GRUMetaValue value, int frame_size, int batch_size, @@ -28,8 +28,7 @@ struct GRUUnitFunctor { const phi::funcs::detail::ActivationType active_gate, bool origin_mode) { #if !defined(__NVCC__) && !defined(__HIPCC___) - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, false, @@ -46,7 +45,7 @@ struct GRUUnitFunctor { frame_size * 3); } - detail::forward_reset_output( + detail::forward_reset_output( phi::funcs::detail::forward::gru_resetOutput(), value, frame_size, @@ -71,7 +70,7 @@ struct GRUUnitFunctor { frame_size * 3); } - detail::forward_final_output( + detail::forward_final_output( phi::funcs::detail::forward::gru_finalOutput(), value, frame_size, @@ -85,8 +84,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor 
{ - static void compute(const paddle::platform::CPUDeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const phi::CPUContext &context, GRUMetaValue value, GRUMetaGrad grad, int frame_size, @@ -103,8 +102,7 @@ struct GRUUnitGradFunctor { batch_size, active_node, origin_mode); - auto blas = - phi::funcs::GetBlas(context); + auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value && grad.prev_out_grad) { blas.GEMM(false, true, @@ -356,10 +354,10 @@ struct GRUUnitGradFunctorV2 { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; template struct GRUUnitFunctorV2; template struct GRUUnitFunctorV2; diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 042b333ad451a..61cedb57faf27 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -257,8 +257,8 @@ template struct RowwiseMean; template struct RowwiseMean; template -struct ElementwiseAddTo { - void operator()(paddle::platform::CPUDeviceContext* ctx, +struct ElementwiseAddTo { + void operator()(phi::CPUContext* ctx, const paddle::framework::Tensor& src, paddle::framework::Tensor* dst) { auto in = paddle::framework::EigenVector::Flatten(src); @@ -268,14 +268,12 @@ struct ElementwiseAddTo { } }; -template struct ElementwiseAddTo; -template struct ElementwiseAddTo; +template struct ElementwiseAddTo; +template struct ElementwiseAddTo; template -struct RowwiseAdd { - void operator()(const paddle::platform::CPUDeviceContext& context, +struct RowwiseAdd { + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& input, const paddle::framework::Tensor& vector, paddle::framework::Tensor* output) { @@ -312,8 +310,8 @@ struct RowwiseAdd { } }; -template struct RowwiseAdd; -template struct RowwiseAdd; +template struct RowwiseAdd; +template struct RowwiseAdd; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/math_function_impl.h b/paddle/phi/kernels/funcs/math_function_impl.h index 7c337e6c0dba9..f9055fb56c913 100644 --- a/paddle/phi/kernels/funcs/math_function_impl.h +++ b/paddle/phi/kernels/funcs/math_function_impl.h @@ -92,9 +92,9 @@ void ColwiseSum::operator()( // colwise-sum can be easily implemented. General reduce has a huge overhead in // CPU template -class ColwiseSum { +class ColwiseSum { public: - void operator()(const paddle::platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto& in_dims = input.dims(); @@ -155,9 +155,9 @@ void RowwiseMean::operator()( // rowwise-sum can be easily implemented. General reduce has a huge overhead in // CPU template -class RowwiseMean { +class RowwiseMean { public: - void operator()(const paddle::platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto& in_dims = input.dims(); @@ -222,9 +222,9 @@ void RowwiseSum::operator()( // rowwise-sum can be easily implemented. 
General reduce has a huge overhead in // CPU template -class RowwiseSum { +class RowwiseSum { public: - void operator()(const paddle::platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& input, paddle::framework::Tensor* out) { auto& in_dims = input.dims(); diff --git a/paddle/phi/kernels/funcs/sequence2batch.cc b/paddle/phi/kernels/funcs/sequence2batch.cc index 0d75ba877db5e..7cad5b6c0b929 100644 --- a/paddle/phi/kernels/funcs/sequence2batch.cc +++ b/paddle/phi/kernels/funcs/sequence2batch.cc @@ -18,9 +18,9 @@ namespace phi { namespace funcs { template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const paddle::platform::CPUDeviceContext& context, + void operator()(const phi::CPUContext& context, const paddle::framework::Tensor& src, paddle::framework::Vector index_lod, paddle::framework::Tensor* dst, @@ -68,18 +68,13 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu index 95334ac36a608..e985f1c417de3 100644 --- a/paddle/phi/tests/common/test_scalar.cu +++ b/paddle/phi/tests/common/test_scalar.cu @@ -164,6 +164,7 @@ TEST(Scalar, ConstructFromDenseTensor7) { .GetAllocator(phi::GPUPlace()) .get()); dev_ctx.Init(); + auto* dense_x_data = dev_ctx.Alloc(&dense_x); FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); dev_ctx.Wait(); diff --git a/paddle/phi/tests/kernels/test_math_function.cc b/paddle/phi/tests/kernels/test_math_function.cc index a13a8cb564f94..b21cf0203febe 100644 --- a/paddle/phi/tests/kernels/test_math_function.cc +++ b/paddle/phi/tests/kernels/test_math_function.cc @@ -20,9 +20,9 @@ namespace phi { namespace tests { template -inline phi::funcs::BlasT GetBlas( - const paddle::platform::CPUDeviceContext& context) { - return phi::funcs::GetBlas(context); +inline phi::funcs::BlasT GetBlas( + const phi::CPUContext& context) { + return phi::funcs::GetBlas(context); } TEST(math_function, gemm_notrans_cblas) { @@ -44,7 +44,7 @@ TEST(math_function, gemm_notrans_cblas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - paddle::platform::CPUDeviceContext context(*cpu_place); + phi::CPUContext context(*cpu_place); GetBlas(context).GEMM(false, false, m, @@ -165,7 +165,7 @@ TEST(math_function, gemm_trans_cblas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - paddle::platform::CPUDeviceContext context(*cpu_place); + phi::CPUContext context(*cpu_place); GetBlas(context).GEMM(false, true, m, @@ -196,8 +196,8 @@ TEST(math_function, zero) { paddle::framework::Tensor tensor; auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); - paddle::platform::CPUDeviceContext context(*cpu_place); - phi::funcs::SetConstant functor; + phi::CPUContext context(*cpu_place); + phi::funcs::SetConstant functor; functor(context, &tensor, 0); EXPECT_EQ(t[0], 0); EXPECT_EQ(t[1], 0); @@ 
-231,7 +231,7 @@ void GemvTest(int m, int n, bool trans) { data_b[i] = static_cast(i); } - paddle::platform::CPUDeviceContext context(*cpu_place); + phi::CPUContext context(*cpu_place); GetBlas(context).GEMV(trans, static_cast(m), static_cast(n), @@ -272,7 +272,7 @@ TEST(math_funciton, set_constant) { paddle::framework::Tensor t; t.Resize({10, 10}); t.mutable_data(paddle::platform::CPUPlace()); - auto* ctx = new paddle::platform::CPUDeviceContext(); + auto* ctx = new phi::CPUContext(); phi::funcs::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { PADDLE_ENFORCE_EQ(10, @@ -311,7 +311,7 @@ void GemmWarpTest(int m, int n, int k, T alpha, T beta) { } // this would call gemm_warp - paddle::platform::CPUDeviceContext context(*cpu_place); + phi::CPUContext context(*cpu_place); GetBlas(context).GEMM( CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, beta, CREF); diff --git a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h index 9cec48f9c99b5..ffe89fde0470e 100644 --- a/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h +++ b/python/paddle/fluid/tests/custom_op/custom_raw_op_kernel_op.h @@ -66,7 +66,7 @@ struct ReluFunctor { return; } #endif - LAUNCH_RELU_KERNEL(paddle::platform::CPUDeviceContext); + LAUNCH_RELU_KERNEL(phi::CPUContext); #undef LAUNCH_RELU_KERNEL } From f1e61f04c6be0ab1c4e85e3a65dc47a0a369ae9a Mon Sep 17 00:00:00 2001 From: jack603047588 <603047588@qq.com> Date: Mon, 4 Jul 2022 10:19:06 +0800 Subject: [PATCH 039/250] fix vlog print problem in fleet (#44011) * fix vlog print problem in fleet * fix log_patch.h code style --- paddle/fluid/framework/fleet/fleet_wrapper.h | 1 + .../framework/fleet/heter_ps/log_patch.h | 34 +++++++++++++++++++ paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 1 + 3 files changed, 36 insertions(+) create mode 100644 paddle/fluid/framework/fleet/heter_ps/log_patch.h diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 982c1b85a5b03..c9c03fb66f8fa 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -38,6 +38,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif +#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/log_patch.h b/paddle/fluid/framework/fleet/heter_ps/log_patch.h new file mode 100644 index 0000000000000..84c83a56f3061 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/log_patch.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#undef VLOG_IS_ON +#define VLOG_IS_ON(verboselevel) (FLAGS_v >= (verboselevel)) + +#undef COMPACT_GOOGLE_LOG_INFO +#define COMPACT_GOOGLE_LOG_INFO google::LogMessage(__FILE__, __LINE__) + +#undef LOG +#define LOG(severity) COMPACT_GOOGLE_LOG_##severity.stream() + +#undef LOG_IF +#define LOG_IF(severity, condition) \ + static_cast(0), \ + !(condition) ? (void)0 : google::LogMessageVoidify() & LOG(severity) + +#undef VLOG +#define VLOG(verboselevel) LOG_IF(INFO, VLOG_IS_ON(verboselevel)) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 0e816beef0d33..fae30a45d2e5b 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -59,6 +59,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_PSLIB #include "downpour_accessor.h" // NOLINT #endif +#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" namespace paddle { namespace framework { From 957258d968b522307f491e9e040a684f4970c18f Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Mon, 4 Jul 2022 11:03:35 +0800 Subject: [PATCH 040/250] Remove boost::optional and boost::none (#44029) --- .../mkldnn/conv_transpose_mkldnn_op.cc | 1 - .../sequence_ops/sequence_concat_op.h | 1 - paddle/fluid/platform/mkldnn_reuse.h | 1 - paddle/fluid/pybind/reader_py.cc | 2 +- .../host_context/mlir_to_runtime_translate.cc | 99 +++++++++---------- .../host_context/mlir_to_runtime_translate.h | 2 +- 6 files changed, 51 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 7a297b3daefd7..cd81168753bed 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -12,7 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index 8d9302fa43b7a..4943e0e2ea09b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -17,7 +17,6 @@ #include #include -#include "boost/optional.hpp" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 05ebedf611a4b..41a4f551cedc1 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include -#include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/pool_op.h" diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 9c80bb8a67e63..36c09f543a6c2 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -22,7 +22,7 @@ #include #include "Python.h" -#include "boost/optional.hpp" + #include "gflags/gflags.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 9292e593a708f..81b41d61ded3e 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -31,7 +31,6 @@ #include #include -#include "boost/optional.hpp" #include "paddle/infrt/common/string.h" #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/mlir_loader.h" @@ -124,118 +123,118 @@ bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); if (val.getType().isInteger(32)) { return val.getValue().getSExtValue(); } } - return boost::none; + return paddle::none; } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); if (val.getType().isInteger(64)) { return val.getValue().getSExtValue(); } } - return boost::none; + return paddle::none; } // TODO(Superjomn) Make double and float parsing share some thing. 
template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); if (val.getType().isF32()) return val.getValueAsDouble(); } - return boost::none; + return paddle::none; } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); return val.getValue(); } - return boost::none; + return paddle::none; } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; if (attr.isa()) { auto val = attr.cast(); if (val.getType().isF64()) return val.getValueAsDouble(); } - return boost::none; + return paddle::none; } template <> -boost::optional<::infrt::TargetType> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional<::infrt::TargetType> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa<::infrt::TargetAttr>()) return boost::none; + if (!attr.isa<::infrt::TargetAttr>()) return paddle::none; if (attr.isa<::infrt::TargetAttr>()) { return attr.cast<::infrt::TargetAttr>().getTarget(); } - return boost::none; + return paddle::none; } template <> -boost::optional<::infrt::LayoutType> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional<::infrt::LayoutType> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa<::infrt::LayoutAttr>()) return boost::none; + if (!attr.isa<::infrt::LayoutAttr>()) return paddle::none; if (attr.isa<::infrt::LayoutAttr>()) { return attr.cast<::infrt::LayoutAttr>().getLayout(); } - return boost::none; + return paddle::none; } template <> -boost::optional<::infrt::PrecisionType> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional<::infrt::PrecisionType> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa<::infrt::PrecisionAttr>()) return boost::none; + if (!attr.isa<::infrt::PrecisionAttr>()) return paddle::none; if (attr.isa<::infrt::PrecisionAttr>()) { return attr.cast<::infrt::PrecisionAttr>().getPrecision(); } - return boost::none; + return paddle::none; } template <> -boost::optional MlirToRuntimeTranslator::EmitAttribute( +paddle::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; return attr.cast().getValue().str(); } -#define PROCESS_ARRAY_INT(type__, bits__) \ - template <> \ - boost::optional> MlirToRuntimeTranslator::EmitAttribute( \ - const mlir::Attribute& attr) { \ - if (!attr.isa()) return boost::none; \ - auto array = attr.cast(); \ - CHECK(!array.empty()); \ - \ - if (!array[0].getType().isInteger(bits__)) { \ - return boost::none; \ - } \ - \ - std::vector res; \ - for (auto& v : array) { \ - res.push_back(v.cast().getValue().getSExtValue()); \ - } \ - return res; \ +#define PROCESS_ARRAY_INT(type__, bits__) \ + template <> \ + paddle::optional> \ + MlirToRuntimeTranslator::EmitAttribute(const mlir::Attribute& attr) { \ + if (!attr.isa()) return paddle::none; \ + auto array = attr.cast(); \ + CHECK(!array.empty()); \ + \ + if 
(!array[0].getType().isInteger(bits__)) { \ + return paddle::none; \ + } \ + \ + std::vector res; \ + for (auto& v : array) { \ + res.push_back(v.cast().getValue().getSExtValue()); \ + } \ + return res; \ } PROCESS_ARRAY_INT(bool, 1); @@ -244,13 +243,13 @@ PROCESS_ARRAY_INT(int32_t, 32); PROCESS_ARRAY_INT(int64_t, 64); template <> -boost::optional> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; auto array = attr.cast(); CHECK(!array.empty()); - if (!array[0].getType().isF32()) return boost::none; + if (!array[0].getType().isF32()) return paddle::none; std::vector res; for (auto& v : array) { @@ -260,13 +259,13 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( } template <> -boost::optional> MlirToRuntimeTranslator::EmitAttribute( +paddle::optional> MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { - if (!attr.isa()) return boost::none; + if (!attr.isa()) return paddle::none; auto array = attr.cast(); CHECK(!array.empty()); - if (!array[0].getType().isF64()) return boost::none; + if (!array[0].getType().isF64()) return paddle::none; std::vector res; for (auto& v : array) { diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index 27a7f20168667..64dc770489c4d 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -75,7 +75,7 @@ class MlirToRuntimeTranslator { bool EmitCallOp(mlir::Operation* op, function_defs_t* function_table); template - boost::optional EmitAttribute(const mlir::Attribute& attr); + paddle::optional EmitAttribute(const mlir::Attribute& attr); Value* GetOpResult(mlir::Operation* op); From c5dbcc8c232452415f3a0424efa51e9dcca0ac4f Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Mon, 4 Jul 2022 11:07:00 +0800 Subject: [PATCH 041/250] Remove boost::tribool (#44030) --- .../mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc | 6 +- .../ir/mkldnn/mkldnn_inplace_pass_tester.cc | 8 +- .../ir/mkldnn/mkldnn_placement_pass_tester.cc | 10 +- paddle/utils/tribool.h | 463 ++++++++++++++++++ 4 files changed, 475 insertions(+), 12 deletions(-) create mode 100644 paddle/utils/tribool.h diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index a2b66263aa792..6f7bb614cc79f 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include #include #include +#include "paddle/utils/tribool.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" @@ -52,12 +52,12 @@ class MKLDNNConvBatchNormPassTest { const std::string& name, const std::vector& inputs, const std::vector& outputs, - boost::tribool use_mkldnn) { + paddle::tribool use_mkldnn) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); - if (!boost::indeterminate(use_mkldnn)) + if (!paddle::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn); if (type == "conv2d_transpose") { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index f6d318f74fe3e..a3b1f730dfc24 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -14,8 +14,8 @@ #include -#include #include +#include "paddle/utils/tribool.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" @@ -44,12 +44,12 @@ class MKLDNNInplacePassTest { const std::string& name, const std::vector& inputs, const std::vector& outputs, - boost::tribool use_mkldnn) { + paddle::tribool use_mkldnn) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); - if (!boost::indeterminate(use_mkldnn)) + if (!paddle::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn); if (type == "conv2d") { @@ -102,7 +102,7 @@ class MKLDNNInplacePassTest { "conv1", std::vector({"a", "weights", "bias"}), std::vector({"f"}), - boost::indeterminate); + paddle::indeterminate); SetOp(&prog, "relu", "relu1", diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc index bb38e6e9091dd..b9c1954dc74e0 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc @@ -14,7 +14,7 @@ #include -#include +#include "paddle/utils/tribool.h" #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" @@ -29,12 +29,12 @@ class PlacementPassTest { const std::string& name, const std::vector& inputs, const std::vector& outputs, - boost::tribool use_mkldnn) { + paddle::tribool use_mkldnn) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); - if (!boost::indeterminate(use_mkldnn)) + if (!paddle::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn); if (type == "conv2d") { @@ -90,13 +90,13 @@ class PlacementPassTest { "concat1", std::vector({"a", "b"}), std::vector({"c"}), - boost::indeterminate); + paddle::indeterminate); SetOp(&prog, "conv2d", "conv1", std::vector({"c", "weights", "bias"}), std::vector({"f"}), - boost::indeterminate); + paddle::indeterminate); SetOp(&prog, "relu", "relu1", diff --git a/paddle/utils/tribool.h b/paddle/utils/tribool.h new file mode 100644 index 0000000000000..98a5019d71535 --- /dev/null +++ b/paddle/utils/tribool.h @@ -0,0 +1,463 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file copy from boost/logic/tribool.hpp, boost version: 1.41.0 +// Modified the following points: +// 1. modify namespace from boost to paddle +// 2. remove the depending boost header files +// 3. remove the dummy_ in indeterminate_t, which is specially implemented for +// Borland C++ Builder + +// Three-state boolean logic library + +// Copyright Douglas Gregor 2002-2004. Use, modification and +// distribution is subject to the Boost Software License, Version +// 1.0. (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// For more information, see http://www.boost.org + +#pragma once + +namespace paddle { +namespace logic { + +/// INTERNAL ONLY +namespace detail { +/** + * INTERNAL ONLY + * + * \brief A type used only to uniquely identify the 'indeterminate' + * function/keyword. + */ +struct indeterminate_t {}; + +} // end namespace detail + +class tribool; + +/** + * INTERNAL ONLY + * The type of the 'indeterminate' keyword. This has the same type as the + * function 'indeterminate' so that we can recognize when the keyword is + * used. + */ +typedef bool (*indeterminate_keyword_t)(tribool, detail::indeterminate_t); + +/** + * \brief Keyword and test function for the indeterminate tribool value + * + * The \c indeterminate function has a dual role. It's first role is + * as a unary function that tells whether the tribool value is in the + * "indeterminate" state. It's second role is as a keyword + * representing the indeterminate (just like "true" and "false" + * represent the true and false states). If you do not like the name + * "indeterminate", and would prefer to use a different name, see the + * macro \c BOOST_TRIBOOL_THIRD_STATE. + * + * \returns x.value == tribool::indeterminate_value + * \throws nothrow + */ +inline bool indeterminate( + tribool x, detail::indeterminate_t dummy = detail::indeterminate_t()); + +/** + * \brief A 3-state boolean type. + * + * 3-state boolean values are either true, false, or + * indeterminate. + */ +class tribool { + private: + /// INTERNAL ONLY + struct dummy { + void nonnull() {} + }; + + typedef void (dummy::*safe_bool)(); + + public: + /** + * Construct a new 3-state boolean value with the value 'false'. + * + * \throws nothrow + */ + tribool() : value(false_value) {} + + /** + * Construct a new 3-state boolean value with the given boolean + * value, which may be \c true or \c false. + * + * \throws nothrow + */ + tribool(bool value) : value(value ? true_value : false_value) {} // NOLINT + + /** + * Construct a new 3-state boolean value with an indeterminate value. + * + * \throws nothrow + */ + tribool(indeterminate_keyword_t) : value(indeterminate_value) {} // NOLINT + + /** + * Use a 3-state boolean in a boolean context. Will evaluate true in a + * boolean context only when the 3-state boolean is definitely true. + * + * \returns true if the 3-state boolean is true, false otherwise + * \throws nothrow + */ + operator safe_bool() const { + return value == true_value ? 
&dummy::nonnull : 0; + } + + /** + * The actual stored value in this 3-state boolean, which may be false, true, + * or indeterminate. + */ + enum value_t { false_value, true_value, indeterminate_value } value; +}; + +// Check if the given tribool has an indeterminate value. Also doubles as a +// keyword for the 'indeterminate' value +inline bool indeterminate(tribool x, detail::indeterminate_t) { + return x.value == tribool::indeterminate_value; +} + +/** @defgroup logical Logical operations + */ +//@{ +/** + * \brief Computes the logical negation of a tribool + * + * \returns the logical negation of the tribool, according to the + * table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *        x        |       !x
+ *   --------------+----------------
+ *   false         | true
+ *   true          | false
+ *   indeterminate | indeterminate
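+ *
+ * Example (editor's illustration; not part of the original Boost source):
+ *
+ *   paddle::tribool x(paddle::indeterminate);
+ *   paddle::tribool y = !x;
+ *   // negating an unknown value stays unknown; test the state with the predicate
+ *   bool still_unknown = indeterminate(y);   // true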
+ * \throws nothrow + */ +inline tribool operator!(tribool x) { + return x.value == tribool::false_value ? tribool(true) + : x.value == tribool::true_value ? tribool(false) + : tribool(indeterminate); +} + +/** + * \brief Computes the logical conjuction of two tribools + * + * \returns the result of logically ANDing the two tribool values, + * according to the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *        &&       | false | true          | indeterminate
+ *   --------------+-------+---------------+---------------
+ *   false         | false | false         | false
+ *   true          | false | true          | indeterminate
+ *   indeterminate | false | indeterminate | indeterminate
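+ *
+ * Example (editor's illustration; not part of the original Boost source):
+ *
+ *   paddle::tribool unknown(paddle::indeterminate);
+ *   paddle::tribool a = false && unknown;   // false: one false operand decides the AND
+ *   paddle::tribool b = true && unknown;    // indeterminate: a true operand cannot decide it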
+ * \throws nothrow + */ +inline tribool operator&&(tribool x, tribool y) { + if (static_cast(!x) || static_cast(!y)) + return false; + else if (static_cast(x) && static_cast(y)) + return true; + else + return indeterminate; +} + +/** + * \overload + */ +inline tribool operator&&(tribool x, bool y) { return y ? x : tribool(false); } + +/** + * \overload + */ +inline tribool operator&&(bool x, tribool y) { return x ? y : tribool(false); } + +/** + * \overload + */ +inline tribool operator&&(indeterminate_keyword_t, tribool x) { + return !x ? tribool(false) : tribool(indeterminate); +} + +/** + * \overload + */ +inline tribool operator&&(tribool x, indeterminate_keyword_t) { + return !x ? tribool(false) : tribool(indeterminate); +} + +/** + * \brief Computes the logical disjunction of two tribools + * + * \returns the result of logically ORing the two tribool values, + * according to the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *        ||       | false         | true | indeterminate
+ *   --------------+---------------+------+---------------
+ *   false         | false         | true | indeterminate
+ *   true          | true          | true | true
+ *   indeterminate | indeterminate | true | indeterminate
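+ *
+ * Example (editor's illustration; not part of the original Boost source):
+ *
+ *   paddle::tribool unknown(paddle::indeterminate);
+ *   paddle::tribool c = true || unknown;    // true: one true operand decides the OR
+ *   paddle::tribool d = false || unknown;   // indeterminate: a false operand cannot decide it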
+ * \throws nothrow + */ +inline tribool operator||(tribool x, tribool y) { + if (static_cast(!x) && static_cast(!y)) + return false; + else if (static_cast(x) || static_cast(y)) + return true; + else + return indeterminate; +} + +/** + * \overload + */ +inline tribool operator||(tribool x, bool y) { return y ? tribool(true) : x; } + +/** + * \overload + */ +inline tribool operator||(bool x, tribool y) { return x ? tribool(true) : y; } + +/** + * \overload + */ +inline tribool operator||(indeterminate_keyword_t, tribool x) { + return x ? tribool(true) : tribool(indeterminate); +} + +/** + * \overload + */ +inline tribool operator||(tribool x, indeterminate_keyword_t) { + return x ? tribool(true) : tribool(indeterminate); +} +//@} + +/** + * \brief Compare tribools for equality + * + * \returns the result of comparing two tribool values, according to + * the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *        ==       | false         | true          | indeterminate
+ *   --------------+---------------+---------------+---------------
+ *   false         | true          | false         | indeterminate
+ *   true          | false         | true          | indeterminate
+ *   indeterminate | indeterminate | indeterminate | indeterminate
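+ *
+ * Example (editor's illustration; not part of the original Boost source):
+ *
+ *   paddle::tribool unknown(paddle::indeterminate);
+ *   // (unknown == paddle::indeterminate) is itself indeterminate and therefore
+ *   // false in a boolean context; check the state with the predicate instead:
+ *   bool is_unknown = indeterminate(unknown);   // true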
+ * \throws nothrow + */ +inline tribool operator==(tribool x, tribool y) { + if (indeterminate(x) || indeterminate(y)) + return indeterminate; + else + return (x && y) || (!x && !y); +} + +/** + * \overload + */ +inline tribool operator==(tribool x, bool y) { return x == tribool(y); } + +/** + * \overload + */ +inline tribool operator==(bool x, tribool y) { return tribool(x) == y; } + +/** + * \overload + */ +inline tribool operator==(indeterminate_keyword_t, tribool x) { + return tribool(indeterminate) == x; +} + +/** + * \overload + */ +inline tribool operator==(tribool x, indeterminate_keyword_t) { + return tribool(indeterminate) == x; +} + +/** + * \brief Compare tribools for inequality + * + * \returns the result of comparing two tribool values for inequality, + * according to the following table: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ *        !=       | false         | true          | indeterminate
+ *   --------------+---------------+---------------+---------------
+ *   false         | false         | true          | indeterminate
+ *   true          | true          | false         | indeterminate
+ *   indeterminate | indeterminate | indeterminate | indeterminate
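+ *
+ * Example (editor's illustration; not part of the original Boost source):
+ *
+ *   paddle::tribool unknown(paddle::indeterminate);
+ *   // even (unknown != unknown) is indeterminate rather than false: an unknown
+ *   // value can be proven neither equal nor unequal to another unknown value
+ *   bool undecided = indeterminate(unknown != unknown);   // true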
+ * \throws nothrow + */ +inline tribool operator!=(tribool x, tribool y) { + if (indeterminate(x) || indeterminate(y)) + return indeterminate; + else + return !((x && y) || (!x && !y)); +} + +/** + * \overload + */ +inline tribool operator!=(tribool x, bool y) { return x != tribool(y); } + +/** + * \overload + */ +inline tribool operator!=(bool x, tribool y) { return tribool(x) != y; } + +/** + * \overload + */ +inline tribool operator!=(indeterminate_keyword_t, tribool x) { + return tribool(indeterminate) != x; +} + +/** + * \overload + */ +inline tribool operator!=(tribool x, indeterminate_keyword_t) { + return x != tribool(indeterminate); +} + +} // namespace logic +} // namespace paddle + +// Pull tribool and indeterminate into namespace "boost" +namespace paddle { +using logic::indeterminate; +using logic::tribool; +} // namespace paddle + +/** + * \brief Declare a new name for the third state of a tribool + * + * Use this macro to declare a new name for the third state of a + * tribool. This state can have any number of new names (in addition + * to \c indeterminate), all of which will be equivalent. The new name will be + * placed in the namespace in which the macro is expanded. + * + * Example: + * PADDLE_TRIBOOL_THIRD_STATE(true_or_false) + * + * tribool x(true_or_false); + * // potentially set x + * if (true_or_false(x)) { + * // don't know what x is + * } + */ +#define PADDLE_TRIBOOL_THIRD_STATE(Name) \ + inline bool Name(boost::logic::tribool x, \ + boost::logic::detail::indeterminate_t dummy = \ + boost::logic::detail::indeterminate_t()) { \ + return x.value == boost::logic::tribool::indeterminate_value; \ + } From cd00d9b40aa09612c0c623408c1d269d3555f618 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Mon, 4 Jul 2022 11:07:46 +0800 Subject: [PATCH 042/250] Remove boost::blank (#44028) --- paddle/fluid/framework/attribute.cc | 4 +- paddle/fluid/framework/ir/generate_pass.cc | 6 +-- paddle/fluid/framework/op_desc.cc | 4 +- paddle/fluid/framework/type_defs.h | 6 +-- .../fluid/platform/device/ipu/ipu_compiler.cc | 8 +-- paddle/phi/api/ext/op_meta_info.h | 2 +- paddle/phi/core/enforce.cc | 5 +- paddle/utils/blank.h | 52 +++++++++++++++++++ 8 files changed, 69 insertions(+), 18 deletions(-) create mode 100644 paddle/utils/blank.h diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc index ed50d5f6bfc4f..a2d0f2db2829d 100644 --- a/paddle/fluid/framework/attribute.cc +++ b/paddle/fluid/framework/attribute.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/attribute.h" -#include "boost/blank.hpp" +#include "paddle/utils/blank.h" namespace paddle { namespace framework { @@ -118,7 +118,7 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { PADDLE_THROW(platform::errors::Unavailable("Unsupport attribute type %d.", attr_desc.type())); } - return boost::blank(); + return paddle::blank(); } } // namespace framework diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 83c3ab9933d61..160304784a9fa 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -14,8 +14,8 @@ #include "paddle/fluid/framework/ir/generate_pass.h" -#include "boost/blank.hpp" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/utils/blank.h" namespace paddle { namespace framework { @@ -40,7 +40,7 @@ class element_visitor : public boost::static_visitor { if (index >= 0 && static_cast(index) < attr.size()) { return static_cast(attr[index]); } - return boost::blank(); + return paddle::blank(); } private: @@ -99,7 +99,7 @@ Attribute GetVarAttrValue(const VarDesc* desc, return shape; } } - return boost::blank(); + return paddle::blank(); } Attribute GetOpAttrValue(const OpDesc* desc, diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index c8a9950ae5efb..169722b971b29 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include -#include "boost/blank.hpp" #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_call_stack.h" @@ -24,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/utils/blank.h" namespace paddle { namespace framework { @@ -810,7 +810,7 @@ struct SetAttrDescVisitor : public boost::static_visitor { VectorToRepeated(v, attr_->mutable_float64s()); } - void operator()(boost::blank) const { + void operator()(paddle::blank) const { PADDLE_THROW(platform::errors::Unavailable( "Unsupported calling method of SetAttrDescVisitor object for " "`boosst::blank` type.")); diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index ca9d6ec44a8d9..5c768b10a3d7e 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -22,9 +22,9 @@ limitations under the License. 
*/ #include #include -#include "boost/blank.hpp" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/variant.h" +#include "paddle/utils/blank.h" #include "paddle/utils/small_vector.h" #include "paddle/utils/variant.h" @@ -42,7 +42,7 @@ class InferNoNeedBufferVarsFN; using VariableNameMap = std::map>; using VariableValueMap = std::map>; -using Attribute = paddle::variant; #ifdef PADDLE_WITH_ASCEND_CL -using NPUAttribute = paddle::variant #include -#include "boost/blank.hpp" +#include "paddle/utils/blank.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/device/ipu/ipu_names.h" @@ -75,9 +75,9 @@ struct CustomOpAttrVisitor : public boost::static_visitor { void operator()(const std::vector& v) const { attrs_->emplace(attr_name_, v); } - void operator()(boost::blank) const { + void operator()(paddle::blank) const { PADDLE_THROW(platform::errors::Unavailable( - "Unsupported calling method for `boost::blank` type when extracting " + "Unsupported calling method for `paddle::blank` type when extracting " "custom operator attributes.")); } }; @@ -124,7 +124,7 @@ struct ConstantOpAttrVisitor : public boost::static_visitor { void operator()(BlockDesc* desc) const { RAISE_ERROR; } void operator()(const std::vector& v) const { RAISE_ERROR; } void operator()(int64_t v) const { RAISE_ERROR; } - void operator()(boost::blank) const { RAISE_ERROR; } + void operator()(paddle::blank) const { RAISE_ERROR; } #undef RAISE_ERROR }; diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index fa19714dde7db..546b0accf8ba7 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -213,7 +213,7 @@ struct KernelFuncImpl { PD_SPECIALIZE_ComputeCallHelper(const std::vector&); // TODO(chenweihang): support other attribute type if needed. // Why not support other attribute type here? - // - boost::blank, std::vector and std::vector + // - paddle::blank, std::vector and std::vector // are not used in op // - BlockDesc* and std::vector are used in framework diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index 0dd415d13130e..8074fbeb49180 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -14,13 +14,12 @@ limitations under the License. 
*/ #include "paddle/phi/core/enforce.h" -#include #include #include #include #include -#include "boost/blank.hpp" +#include "paddle/utils/blank.h" #include "paddle/utils/variant.h" namespace egr { @@ -29,7 +28,7 @@ class EagerVariable; namespace paddle { namespace framework { class BlockDesc; -using Attribute = paddle::variant=(const blank&, const blank&) { return true; } + +inline bool operator!=(const blank&, const blank&) { return false; } + +inline bool operator<(const blank&, const blank&) { return false; } + +inline bool operator>(const blank&, const blank&) { return false; } + +} // namespace paddle From 01fedf4f7c684f2131f854de83e18d071073d8ff Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Mon, 4 Jul 2022 11:10:13 +0800 Subject: [PATCH 043/250] Remove boost::static_visitor (#44024) --- paddle/fluid/framework/dlpack_tensor.cc | 3 ++- paddle/fluid/framework/ir/generate_pass.cc | 4 ++-- paddle/fluid/framework/op_desc.cc | 2 +- paddle/fluid/framework/tensor_util.cc | 8 ++++---- .../fluid/imperative/gradient_accumulator.cc | 3 ++- .../allocation/naive_best_fit_allocator.cc | 8 ++++---- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- paddle/fluid/operators/controlflow/feed_op.cc | 2 +- .../fluid/operators/controlflow/op_variant.cc | 11 ++++------- .../fluid/operators/lod_tensor_to_array_op.cc | 3 ++- .../fluid/operators/math/matrix_bit_code.cc | 17 ++++++++--------- .../fluid/platform/device/ipu/ipu_compiler.cc | 4 ++-- paddle/fluid/platform/variant.h | 1 - .../pybind/global_value_getter_setter.cc | 2 +- paddle/fluid/pybind/pybind_boost_headers.h | 19 +------------------ paddle/phi/kernels/funcs/math_function.cc | 3 ++- 16 files changed, 37 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 057a19f31759b..b7bca733b8f9e 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -69,7 +69,8 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { #undef REG_DL_DATA_TYPE } -struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { +struct DLDeviceVisitor + : public std::unary_function { inline ::DLDevice operator()(const platform::CPUPlace &place) const { ::DLDevice device; device.device_type = kDLCPU; diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 160304784a9fa..455af83427819 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -21,7 +21,7 @@ namespace paddle { namespace framework { namespace ir { -class element_visitor : public boost::static_visitor { +class element_visitor { public: explicit element_visitor(int index) : index_(index) {} @@ -47,7 +47,7 @@ class element_visitor : public boost::static_visitor { int index_; }; -class operation_visitor : public boost::static_visitor { +class operation_visitor { public: explicit operation_visitor(const proto::PassDesc::OperationType& type) : type_(type) {} diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 169722b971b29..c0a9528c28126 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -764,7 +764,7 @@ void OpDesc::RenameInput(const std::string &old_name, need_update_ = true; } -struct SetAttrDescVisitor : public boost::static_visitor { +struct SetAttrDescVisitor { explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {} mutable proto::OpDesc::Attr *attr_; void operator()(int v) 
const { attr_->set_i(v); } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index dd80458b624c6..dbb549efa2519 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -689,7 +689,7 @@ inline void AnyImpl(Predicate predicate, } template -class AnyVisitor : public boost::static_visitor { +class AnyVisitor : public std::unary_function { private: const framework::Tensor& tensor_; Predicate predicate_; @@ -774,7 +774,7 @@ class AnyVisitor : public boost::static_visitor { }; template -class AnyOutVisitor : public boost::static_visitor<> { +class AnyOutVisitor : public std::unary_function { private: const framework::Tensor& tensor_; mutable framework::Tensor* out_; @@ -843,7 +843,7 @@ inline void AllImpl(Predicate predicate, } template -class AllOutVisitor : public boost::static_visitor<> { +class AllOutVisitor : public std::unary_function { private: const framework::Tensor& tensor_; mutable framework::Tensor* out_; @@ -942,7 +942,7 @@ static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) { } #endif -struct BothFalseVisitor : public boost::static_visitor<> { +struct BothFalseVisitor : public std::unary_function { const framework::Tensor& in_; mutable framework::Tensor* out_; BothFalseVisitor(const framework::Tensor& in, framework::Tensor* out) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 4a8fc6a5d546c..f6883fe6c6a92 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -79,7 +79,8 @@ static void MoveOrCopyVar(framework::Variable* dst, } template -class TensorAddFunctor : public boost::static_visitor<> { +class TensorAddFunctor + : public std::unary_function { public: TensorAddFunctor(int64_t numel, const T* x, T* y) : numel_(numel), x_(x), y_(y) {} diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 9d5f048a1651d..57c5941d5227d 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -59,7 +59,7 @@ uint64_t Release(const Place &place); template size_t Used(const Place &place); -struct Usage : public boost::static_visitor { +struct Usage { size_t operator()(const platform::CPUPlace &cpu) const; size_t operator()(const platform::CUDAPlace &gpu) const; size_t operator()(const platform::CUDAPinnedPlace &cuda_pinned) const; @@ -894,7 +894,7 @@ size_t Used(const platform::CustomPlace &place) { #endif } -struct AllocVisitor : public boost::static_visitor { +struct AllocVisitor : std::unary_function { inline explicit AllocVisitor(size_t size) : size_(size) {} template @@ -906,7 +906,7 @@ struct AllocVisitor : public boost::static_visitor { size_t size_; }; -struct FreeVisitor : public boost::static_visitor { +struct FreeVisitor : public std::unary_function { inline explicit FreeVisitor(void *ptr, size_t size) : ptr_(ptr), size_(size) {} @@ -920,7 +920,7 @@ struct FreeVisitor : public boost::static_visitor { size_t size_; }; -struct ReleaseVisitor : public boost::static_visitor { +struct ReleaseVisitor : std::unary_function { template inline uint64_t operator()(const Place &place) const { return Release(place); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 5b23ff604759a..a2af64e227680 100644 --- 
a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -43,7 +43,7 @@ struct ArrayToLoDFunctorImpl { void apply(); }; -struct ArrayToLoDFunctor : public boost::static_visitor { +struct ArrayToLoDFunctor : public std::unary_function { std::vector in; mutable framework::Tensor *out; diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 00806d18c066f..4cef104496510 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -29,7 +29,7 @@ namespace operators { // FeedVariableVisitor is to feed the variable data // according to data type (LoDTensor or Strings). -class FeedVariableVisitor : public boost::static_visitor { +class FeedVariableVisitor { public: explicit FeedVariableVisitor(framework::Variable *out_var, const platform::Place &place) diff --git a/paddle/fluid/operators/controlflow/op_variant.cc b/paddle/fluid/operators/controlflow/op_variant.cc index 60f58955adbed..48b7a43410672 100644 --- a/paddle/fluid/operators/controlflow/op_variant.cc +++ b/paddle/fluid/operators/controlflow/op_variant.cc @@ -17,24 +17,21 @@ namespace paddle { namespace operators { -struct InputsVisitor - : public boost::static_visitor { +struct InputsVisitor { template const framework::VariableNameMap *operator()(const OpType *op) const { return &(op->Inputs()); } }; -struct OutputsVisitor - : public boost::static_visitor { +struct OutputsVisitor { template const framework::VariableNameMap *operator()(const OpType *op) const { return &(op->Outputs()); } }; -struct AttributeMapVisitor - : public boost::static_visitor { +struct AttributeMapVisitor { const framework::AttributeMap *operator()(const framework::OpDesc *op) const { return &(op->GetAttrMap()); } @@ -45,7 +42,7 @@ struct AttributeMapVisitor } }; -struct RawPointerVisitor : public boost::static_visitor { +struct RawPointerVisitor { template const void *operator()(const OpType *op) const { return op; diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 147b23f56acdc..d4b36f31e6201 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -44,7 +44,8 @@ struct LoDTensorToArrayFunctorImpl { void apply(); }; -struct LoDTensorToArrayFunctor : public boost::static_visitor { +struct LoDTensorToArrayFunctor + : public std::unary_function { std::vector ref_inputs_; mutable std::vector outputs_; const framework::Tensor &input_; diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 8a6f098baefd9..0648f2497d9d7 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -19,7 +19,7 @@ namespace operators { namespace math { template -struct MatrixBitCodeFunctorAdd : public boost::static_visitor { +struct MatrixBitCodeFunctorAdd { const framework::Tensor &vec_; framework::Tensor *tmat_; @@ -51,7 +51,7 @@ void MatrixBitCodeFunctor::Add(const framework::Tensor &vec, } template -struct MatrixBitCodeFunctorAddGrad : public boost::static_visitor { +struct MatrixBitCodeFunctorAddGrad { const framework::Tensor &tmat_; framework::Tensor *vec_; MatrixBitCodeFunctorAddGrad(const framework::Tensor &tmat, @@ -83,7 +83,7 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorSum : public boost::static_visitor { +struct 
MatrixBitCodeFunctorSum { const framework::Tensor &tmat_; framework::Tensor *sum_; T scale_sum_; @@ -125,7 +125,7 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorMul : public boost::static_visitor { +struct MatrixBitCodeFunctorMul { framework::Tensor *tmat_; const framework::Tensor &weight_; const framework::Tensor &input_; @@ -174,7 +174,7 @@ class ReservedVector : public std::vector { }; template -struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor { +struct MatrixBitCodeFunctorMulGradWeight { const framework::Tensor &tmat_; framework::Tensor *weight_; const framework::Tensor &input_; @@ -224,8 +224,7 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorMulGradWeightSR - : public boost::static_visitor { +struct MatrixBitCodeFunctorMulGradWeightSR { const framework::Tensor &tmat_; phi::SelectedRows *weight_; const framework::Tensor &input_; @@ -280,7 +279,7 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorMulGradError : public boost::static_visitor { +struct MatrixBitCodeFunctorMulGradError { const framework::Tensor &tmat_; const framework::Tensor &weight_; framework::Tensor *input_; @@ -324,7 +323,7 @@ void MatrixBitCodeFunctor::MulGradError(const framework::Tensor &tmat, } template -struct MatrixBitCodeFunctorSub : public boost::static_visitor { +struct MatrixBitCodeFunctorSub { framework::Tensor *tmat_; explicit MatrixBitCodeFunctorSub(framework::Tensor *tmat) : tmat_(tmat) {} diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 39ff4601b6749..330ddef577ef2 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -32,7 +32,7 @@ namespace ipu { namespace { -struct CustomOpAttrVisitor : public boost::static_visitor { +struct CustomOpAttrVisitor { CustomOpAttrVisitor(std::map* attr, const std::string& attr_name) : attrs_(attr), attr_name_(attr_name) {} @@ -82,7 +82,7 @@ struct CustomOpAttrVisitor : public boost::static_visitor { } }; -struct ConstantOpAttrVisitor : public boost::static_visitor { +struct ConstantOpAttrVisitor { ConstantOpAttrVisitor(framework::LoDTensor* tensor, VarType::Type dtype) : tensor_(tensor), dtype_(dtype) {} diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index fb4772abd3062..9682749898fc7 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -40,7 +40,6 @@ limitations under the License. 
*/ #include #include -#include #include "paddle/utils/any.h" #include "paddle/utils/optional.h" diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 2871d1de56780..b2a52e568aed9 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -217,7 +217,7 @@ void BindGlobalValueGetterSetter(pybind11::module *module) { GlobalVarGetterSetterRegistry::CreateSetter(&var)); \ } while (0) -struct RegisterGetterSetterVisitor : public boost::static_visitor { +struct RegisterGetterSetterVisitor { RegisterGetterSetterVisitor(const std::string &name, bool is_writable, void *value_ptr) diff --git a/paddle/fluid/pybind/pybind_boost_headers.h b/paddle/fluid/pybind/pybind_boost_headers.h index 623ec84acda6f..2a25990944d14 100644 --- a/paddle/fluid/pybind/pybind_boost_headers.h +++ b/paddle/fluid/pybind/pybind_boost_headers.h @@ -18,14 +18,12 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/fluid/platform/variant.h" #include "paddle/utils/variant.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" // Cast paddle::variant for PyBind. // Copy from - // https://github.com/pybind/pybind11/issues/576#issuecomment-269563199 namespace pybind11 { namespace detail { @@ -78,10 +76,7 @@ struct paddle_variant_caster> { using Type = V; template - typename std::enable_if< - !std::is_same::value, - bool>::type - try_load(handle src, bool convert) { + bool try_load(handle src, bool convert) { auto caster = make_caster(); if (!load_success_ && caster.load(src, convert)) { load_success_ = true; @@ -112,13 +107,6 @@ struct paddle_variant_caster> { return false; } - template - typename std::enable_if::value, - bool>::type - try_load(handle src, bool convert) { - return false; - } - bool load(handle src, bool convert) { auto unused = {false, try_load(src, convert)...}; (void)(unused); @@ -128,11 +116,6 @@ struct paddle_variant_caster> { static handle cast(Type const& src, return_value_policy policy, handle parent) { - /* - auto paddle_variant_caster_visitor = [&](Type const& src)->handle { - return make_caster::cast(src, policy, parent); - } - */ paddle_variant_caster_visitor visitor(policy, parent); return paddle::visit(visitor, src); } diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 61cedb57faf27..15a708f02f497 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -214,7 +214,8 @@ void set_constant_with_place( phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); } -struct TensorSetConstantWithPlace : public boost::static_visitor { +struct TensorSetConstantWithPlace + : public std::unary_function { TensorSetConstantWithPlace(const paddle::platform::DeviceContext& context, paddle::framework::Tensor* tensor, float value) From a42f48bd5d4da5010dff57dca2456604a5cfa4b3 Mon Sep 17 00:00:00 2001 From: yaozhixin Date: Mon, 4 Jul 2022 11:21:35 +0800 Subject: [PATCH 044/250] update paddle inference fp16 mode (#44014) --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/ipu/inference_dtype_transfer_pass.cc | 104 ++++++++++++++++++ .../ir/ipu/inference_dtype_transfer_pass.h | 30 +++++ .../ir/ipu/inference_process_pass.cc | 6 +- .../fluid/platform/device/ipu/ipu_compiler.cc | 10 +- .../fluid/platform/device/ipu/ipu_executor.cc | 2 + .../fluid/platform/device/ipu/ipu_strategy.cc | 1 + 
.../fluid/platform/device/ipu/ipu_strategy.h | 5 +- 8 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc create mode 100644 paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index d19b163817e41..8569a3bb6151f 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -240,6 +240,7 @@ if(WITH_IPU) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) pass_library(avg_shard_pass base DIR ipu) + pass_library(inference_dtype_transfer_pass base DIR ipu) endif() cc_library( diff --git a/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc new file mode 100644 index 0000000000000..f06f05e9f0242 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/phi/common/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferenceDtypeTransferPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferenceDtypeTransferPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto* ipu_backend = platform::ipu::IpuBackend::GetInstance(); + auto enable_fp16 = ipu_backend->GetIpuStrategy()->enable_fp16; + + if (enable_fp16) { + VLOG(10) << "Transfer var to fp16..."; + auto* scope = ipu_backend->GetScope(); + + std::unordered_set used_var_names; + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + auto var_desc = node->Var(); + if (var_desc->GetDataType() == proto::VarType::FP32) { + // Transfer the dtypes of var_desc + var_desc->SetDataType(proto::VarType::FP16); + VLOG(10) << "Transfer the VarDesc of " << var_desc->Name() << " to " + << var_desc->GetDataType(); + + if (node->inputs.empty() && node->Var()->Persistable() && + scope->FindVar(var_desc->Name()) && + used_var_names.find(var_desc->Name()) == used_var_names.end()) { + // Transfer the dtypes of weight tensors + std::vector fp16_data; + auto* tensor = scope->FindVar(var_desc->Name()) + ->GetMutable(); + auto* data_ptr = tensor->data(); + auto num_elem = tensor->numel(); + + std::transform(data_ptr, + data_ptr + num_elem, + std::back_inserter(fp16_data), + [&](float elem) { return float16(elem); }); + memcpy(reinterpret_cast(data_ptr), + fp16_data.data(), + num_elem * sizeof(float16)); + tensor->set_type( + framework::TransToPhiDataType(proto::VarType::FP16)); + } + } + 
used_var_names.insert(var_desc->Name()); + } + if (node->IsOp()) { + auto* op_desc = node->Op(); + if (op_desc->Type() == "popart_cast") { + // Transfer the target dtype of cast Op + if (BOOST_GET_CONST(std::string, op_desc->GetAttr("to")) == "FLOAT") { + op_desc->SetAttr("to", std::string("FLOAT16")); + op_desc->Flush(); + } + } + if (op_desc->Type() == "popart_constant") { + // Transfer the dtype of fill_constant Op + if (op_desc->GetAttrIfExists("dtype") == 1) { + op_desc->SetAttr("dtype", 10); + op_desc->Flush(); + } + } + } + } + VLOG(10) << "Transfer var to fp16...Done"; + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave InferenceDtypeTransferPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inference_dtype_transfer_pass, + paddle::framework::ir::InferenceDtypeTransferPass); diff --git a/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.h b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.h new file mode 100644 index 0000000000000..3111968ea2bba --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferenceDtypeTransferPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index 8357ec05c24f6..1ef03b1bd9cfb 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -90,6 +90,9 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { ipu_strategy_instance_->available_memory_proportion = graph->Get("available_memory_proportion"); + // Set tiles_per_ipu for IPUMODEL + ipu_strategy_instance_->tiles_per_ipu = 128; + ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get())); // Get feed_list and fetch list @@ -124,7 +127,8 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { std::vector graph_pass = {"forward_graph_extract_pass", "infer_shape_pass", "avg_shard_pass", - "popart_canonicalization_pass"}; + "popart_canonicalization_pass", + "inference_dtype_transfer_pass"}; std::vector compile_pass = {"ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 330ddef577ef2..74b216f4e0f58 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -111,7 +111,13 @@ struct ConstantOpAttrVisitor { framework::TensorFromVector(vec, tensor_); } void operator()(const std::vector& vec) const { - framework::TensorFromVector(vec, tensor_); + // popart do not support float64 constant + std::vector vec_fp32; + std::transform(vec.begin(), + vec.end(), + std::back_inserter(vec_fp32), + [](double f) -> float { return static_cast(f); }); + framework::TensorFromVector(vec_fp32, tensor_); } #define RAISE_ERROR \ PADDLE_THROW( \ @@ -416,7 +422,7 @@ void Compiler::LowerWeights(const Scope* scope) { auto* node = graph_helper_->nodes_id_map[id]; // Weights are var node and Persistable if (node->IsVar() && !node->IsCtrlVar() && node->Var() && - node->Var()->Persistable()) { + node->Var()->Persistable() && node->inputs.empty()) { // Weights are Parameter in training mode if (ipu_strategy_->is_training && !node->Var()->IsParameter()) { continue; diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 4db25e880f3a9..3cd4a12b378a3 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -257,6 +257,7 @@ void Executor::AcquireDevice() { "numIPUs", std::to_string(ipu_strategy_->num_ipus), }, + {"tilesPerIPU", std::to_string(ipu_strategy_->tiles_per_ipu)}, {"ipuVersion", "ipu2"}, }; device_ = popart::DeviceManager::createDeviceManager().createIpuModelDevice( @@ -269,6 +270,7 @@ void Executor::AcquireDevice() { "numIPUs", std::to_string(ipu_strategy_->num_ipus), }, + {"tilesPerIPU", std::to_string(ipu_strategy_->tiles_per_ipu)}, {"ipuVersion", "ipu2"}, }; device_ = diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index eeffd0a36e015..e7d53c751f2b9 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -91,6 +91,7 @@ 
IpuStrategy::IpuStrategy() { ADD_UINT64_OPTION(batches_per_step); ADD_UINT64_OPTION(micro_batch_size); ADD_UINT64_OPTION(random_seed); + ADD_UINT64_OPTION(tiles_per_ipu); ADD_DOUBLE_OPTION(available_memory_proportion); ADD_DOUBLE_OPTION(loss_scaling); ADD_DOUBLE_OPTION(max_weight_norm); diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 1fdde59cf856c..9ae54108ac528 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -41,7 +41,7 @@ class IpuStrategy { // Average sharding, debugging used bool need_avg_shard = false; - // Flag for fp16, true for pure fp16 + // Flag for fp16, true for inference with pure fp16 bool enable_fp16 = false; // The mode of Adam/Lamb optimizer @@ -64,6 +64,9 @@ class IpuStrategy { // Micro batch-size int micro_batch_size = 1; + // The number of virtual tiles for IPUMODEL + int tiles_per_ipu = 4; + // Random seed std::uint64_t random_seed = std::numeric_limits::max(); From cf8e86df35aecae07501e79be358a9ce48b84934 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Mon, 4 Jul 2022 13:48:42 +0800 Subject: [PATCH 045/250] [CINN] Enable test_resnet50_with_cinn (#44017) --- .../fluid/tests/unittests/CMakeLists.txt | 19 +++++++++++++++++++ .../unittests/test_resnet50_with_cinn.py | 4 ++++ 2 files changed, 23 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6df9c8c4269ca..06bec07d7acaf 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1687,3 +1687,22 @@ if($ENV{USE_STANDALONE_EXECUTOR}) set_tests_properties(test_imperative_mnist_sorted_gradient PROPERTIES ENVIRONMENT FLAGS_USE_STANDALONE_EXECUTOR=0) endif() + +if(WITH_CINN AND WITH_TESTING) + set_tests_properties( + test_resnet50_with_cinn + PROPERTIES + LABELS + "RUN_TYPE=CINN" + ENVIRONMENT + FLAGS_allow_cinn_ops="conv2d;conv2d_grad;elementwise_add;elementwise_add_grad;relu;relu_grad;sum" + ) + set_tests_properties( + test_parallel_executor_run_cinn + PROPERTIES + LABELS + "RUN_TYPE=CINN" + ENVIRONMENT + FLAGS_allow_cinn_ops="conv2d;conv2d_grad;elementwise_add;elementwise_add_grad;relu;relu_grad;sum" + ) +endif() diff --git a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py index 829960250d05d..4aebad4e87cb6 100644 --- a/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py +++ b/python/paddle/fluid/tests/unittests/test_resnet50_with_cinn.py @@ -108,6 +108,10 @@ def test_check_resnet50_accuracy(self): loss_c = self.train(place, loop_num, feed, use_cinn=True) loss_p = self.train(place, loop_num, feed, use_cinn=False) + print("Losses of CINN:") + print(loss_c) + print("Losses of Paddle") + print(loss_p) self.assertTrue(np.allclose(loss_c, loss_p, atol=1e-5)) From 9e3433bd9f0df966fe3a6c1c225ddda73e561653 Mon Sep 17 00:00:00 2001 From: Bo Zhang <105368690+zhangbopd@users.noreply.github.com> Date: Mon, 4 Jul 2022 15:44:02 +0800 Subject: [PATCH 046/250] Merge dimensions && OP performance optimization (#43931) --- paddle/phi/kernels/gpu/cross_grad_kernel.cu | 46 ++++++++++++------ paddle/phi/kernels/gpu/cross_kernel.cu | 52 ++++++++++++++------- 2 files changed, 67 insertions(+), 31 deletions(-) diff --git a/paddle/phi/kernels/gpu/cross_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_grad_kernel.cu index ada78adb77fc9..a6399ba39dcae 100644 --- 
a/paddle/phi/kernels/gpu/cross_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_grad_kernel.cu @@ -22,8 +22,6 @@ namespace phi { -using funcs::IndexCalculator; - template __global__ void CrossGrad(const T* x, const T* y, @@ -32,7 +30,7 @@ __global__ void CrossGrad(const T* x, T* out_dy, const int stride, const int N, - IndexCalculator index_calculator) { + phi::funcs::IndexCalculator index_calculator) { CUDA_KERNEL_LOOP(i, N) { int offset = index_calculator(i); @@ -107,32 +105,52 @@ void CrossGradKernel(const Context& dev_ctx, std::vector cal_dims; std::vector left_strides; std::vector full_strides; + std::vector merged_dims; + + for (int i = 0; i < dim; i++) { + if (i == 0) { + merged_dims.push_back(input_x_dims[i]); + } else { + merged_dims[0] *= input_x_dims[i]; + } + } + int merge_axis = merged_dims.size(); + merged_dims.push_back(input_x_dims[dim]); + for (int i = dim + 1; i < input_x_dims.size(); i++) { + if (i == dim + 1) { + merged_dims.push_back(input_x_dims[i]); + } else { + merged_dims[merge_axis + 1] *= input_x_dims[i]; + } + } int full_dim = 1; - int left_dim = 1; - for (auto i = 0; i < input_x_dims.size(); i++) { + for (int i = 0; i < merged_dims.size(); i++) { full_strides.insert(full_strides.begin(), full_dim); - full_dim *= input_x_dims[input_x_dims.size() - i - 1]; - if (i == dim) { + full_dim *= merged_dims[merged_dims.size() - i - 1]; + if (i == merge_axis) { continue; } cal_dims.push_back(i); + } + int left_dim = 1; + for (int i = merged_dims.size() - 1; i >= 0; i--) { + if (i == merge_axis) { + continue; + } left_strides.insert(left_strides.begin(), left_dim); - left_dim *= input_x_dims[input_x_dims.size() - i - 1]; + left_dim *= merged_dims[i]; } const auto* input_x_data = input_x.data(); const auto* input_y_data = input_y.data(); const auto* input_out_grad_data = input_out_grad.data(); - auto* output_x_grad_data = dev_ctx.template Alloc(x_grad); auto* output_y_grad_data = dev_ctx.template Alloc(y_grad); - - auto index_calculator = IndexCalculator( - input_x_dims.size() - 1, cal_dims, left_strides, full_strides); + auto index_calculator = phi::funcs::IndexCalculator( + merged_dims.size() - 1, cal_dims, left_strides, full_strides); int64_t numel = x.numel(); - backends::gpu::GpuLaunchConfig config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel / 3); @@ -144,7 +162,7 @@ void CrossGradKernel(const Context& dev_ctx, input_out_grad_data, output_x_grad_data, output_y_grad_data, - full_strides[dim], + full_strides[merge_axis], numel / 3, index_calculator); } diff --git a/paddle/phi/kernels/gpu/cross_kernel.cu b/paddle/phi/kernels/gpu/cross_kernel.cu index 44173f4fbe62d..0e1e7b3a42568 100644 --- a/paddle/phi/kernels/gpu/cross_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_kernel.cu @@ -22,15 +22,13 @@ namespace phi { -using funcs::IndexCalculator; - template __global__ void Cross(const T* x, const T* y, T* out, const int stride, const int N, - IndexCalculator index_calculator) { + phi::funcs::IndexCalculator index_calculator) { CUDA_KERNEL_LOOP(i, N) { int offset = index_calculator(i); @@ -96,30 +94,50 @@ void CrossKernel(const Context& dev_ctx, std::vector cal_dims; std::vector left_strides; std::vector full_strides; + std::vector merged_dims; + + for (int i = 0; i < dim; i++) { + if (i == 0) { + merged_dims.push_back(input_x_dims[i]); + } else { + merged_dims[0] *= input_x_dims[i]; + } + } + int merge_axis = merged_dims.size(); + merged_dims.push_back(input_x_dims[dim]); + for (int i = dim + 1; i < input_x_dims.size(); i++) { + if (i == dim + 1) { + 
merged_dims.push_back(input_x_dims[i]); + } else { + merged_dims[merge_axis + 1] *= input_x_dims[i]; + } + } - int dims0 = 1; - int dims1 = 1; - for (auto i = 0; i < input_x_dims.size(); i++) { - full_strides.insert(full_strides.begin(), dims0); - dims0 *= input_x_dims[input_x_dims.size() - i - 1]; - if (i == dim) { + int full_dim = 1; + for (int i = 0; i < merged_dims.size(); i++) { + full_strides.insert(full_strides.begin(), full_dim); + full_dim *= merged_dims[merged_dims.size() - i - 1]; + if (i == merge_axis) { continue; } cal_dims.push_back(i); - left_strides.insert(left_strides.begin(), dims1); - dims1 *= input_x_dims[input_x_dims.size() - i - 1]; + } + int left_dim = 1; + for (int i = merged_dims.size() - 1; i >= 0; i--) { + if (i == merge_axis) { + continue; + } + left_strides.insert(left_strides.begin(), left_dim); + left_dim *= merged_dims[i]; } const auto* input_x_data = input_x.data(); const auto* input_y_data = input_y.data(); - auto* out_data = dev_ctx.template Alloc(out); - - auto index_calculator = IndexCalculator( - input_x_dims.size() - 1, cal_dims, left_strides, full_strides); + auto index_calculator = phi::funcs::IndexCalculator( + merged_dims.size() - 1, cal_dims, left_strides, full_strides); int64_t numel = x.numel(); - backends::gpu::GpuLaunchConfig config = backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel / 3); @@ -129,7 +147,7 @@ void CrossKernel(const Context& dev_ctx, dev_ctx.stream()>>>(input_x_data, input_y_data, out_data, - full_strides[dim], + full_strides[merge_axis], numel / 3, index_calculator); } From 2b0c22adae830de60ee85ca7c6130fbefb16c26e Mon Sep 17 00:00:00 2001 From: Leo Guo <58431564+ZibinGuo@users.noreply.github.com> Date: Mon, 4 Jul 2022 17:17:26 +0800 Subject: [PATCH 047/250] Modify the unittests of the conv2d_transpose, gaussian_random op. 
test=kunlun (#43961) --- .../xpu/test_conv2d_transpose_op_xpu.py | 301 +++++++++--------- .../xpu/test_gaussian_random_op_xpu.py | 294 ++++++++++++++++- 2 files changed, 429 insertions(+), 166 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py index 4204a73524d27..22bc8fef839b8 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py @@ -22,9 +22,11 @@ import paddle.fluid.core as core import paddle.fluid as fluid from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper import paddle import paddle.nn as nn -from paddle.fluid import Program, program_guard + +paddle.enable_static() def conv2dtranspose_forward_naive(input_, filter_, attrs): @@ -117,166 +119,159 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): return out -class TestConv2DTransposeOp(XPUOpTest): - - def setUp(self): - # init as conv transpose - self.dtype = np.float32 - self.need_check_grad = True - self.is_test = False - self.use_cudnn = False - self.use_mkldnn = False - self.output_size = None - self.output_padding = [] - self.data_format = "NCHW" - self.pad = [0, 0] - self.padding_algorithm = "EXPLICIT" - self.init_op_type() - self.init_test_case() - self.__class__.op_type = "conv2d_transpose" - - input_ = np.random.random(self.input_size).astype(self.dtype) - filter_ = np.random.random(self.filter_size).astype(self.dtype) - - self.inputs = {'Input': input_, 'Filter': filter_} - self.attrs = { - 'strides': self.stride, - 'paddings': self.pad, - 'padding_algorithm': self.padding_algorithm, - 'groups': self.groups, - 'dilations': self.dilations, - 'use_cudnn': self.use_cudnn, - 'is_test': self.is_test, - 'use_mkldnn': self.use_mkldnn, - 'data_format': self.data_format - } - if self.output_size is not None: - self.attrs['output_size'] = self.output_size - - if len(self.output_padding) > 0: - self.attrs['output_padding'] = self.output_padding - - output = conv2dtranspose_forward_naive(input_, filter_, - self.attrs).astype(self.dtype) - - self.outputs = {'Output': output} - - def test_check_output(self): - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad_no_input(self): - if self.need_check_grad: - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Filter'], +class XPUTestConv2DTransposeOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'conv2d_transpose' + self.use_dynamic_create_class = False + + class TestConv2DTransposeOp(XPUOpTest): + + def setUp(self): + # init as conv transpose + self.need_check_grad = True + self.is_test = False + self.use_cudnn = False + self.use_mkldnn = False + self.output_size = None + self.output_padding = [] + self.data_format = "NCHW" + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + self.init_op_type() + self.init_test_case() + self.__class__.op_type = "conv2d_transpose" + + input_ = np.random.random(self.input_size).astype(self.dtype) + filter_ = np.random.random(self.filter_size).astype(self.dtype) + + self.inputs = {'Input': input_, 'Filter': filter_} + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'padding_algorithm': self.padding_algorithm, + 'groups': 
self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'is_test': self.is_test, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format + } + if self.output_size is not None: + self.attrs['output_size'] = self.output_size + + if len(self.output_padding) > 0: + self.attrs['output_padding'] = self.output_padding + + output = conv2dtranspose_forward_naive( + input_, filter_, self.attrs).astype(self.dtype) + + self.outputs = {'Output': output} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_no_input(self): + if self.need_check_grad: + self.check_grad_with_place(self.place, ['Filter'], 'Output', no_grad_set=set(['Input'])) - def test_check_grad_no_filter(self): - if self.need_check_grad: - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['Input'], + def test_check_grad_no_filter(self): + if self.need_check_grad: + self.check_grad_with_place(self.place, ['Input'], 'Output', no_grad_set=set(['Filter'])) - def test_check_grad(self): - if self.need_check_grad: - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, set(['Input', 'Filter']), + def test_check_grad(self): + if self.need_check_grad: + self.check_grad_with_place(self.place, set(['Input', 'Filter']), 'Output') - def init_test_case(self): - self.pad = [0, 0] - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - - def init_op_type(self): - self.op_type = "conv2d_transpose" - - -class TestWithSymmetricPad(TestConv2DTransposeOp): - - def init_test_case(self): - self.pad = [1, 1] - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - - -class TestWithAsymmetricPad(TestConv2DTransposeOp): - - def init_test_case(self): - self.pad = [1, 0, 1, 2] - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - - -class TestWithSAMEPad(TestConv2DTransposeOp): - - def init_test_case(self): - self.stride = [2, 1] - self.dilations = [1, 2] - self.groups = 1 - self.input_size = [2, 3, 6, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 4, 3] - self.padding_algorithm = 'SAME' - - -class TestWithVALIDPad(TestConv2DTransposeOp): - - def init_test_case(self): - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - self.padding_algorithm = 'VALID' - - -class TestWithGroups(TestConv2DTransposeOp): - - def init_test_case(self): - self.pad = [1, 1] - self.stride = [1, 1] - self.dilations = [1, 1] - self.groups = 2 - self.input_size = [2, 4, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 3, 3, 3] - - -class TestWithStride(TestConv2DTransposeOp): - - def init_test_case(self): - self.pad = [1, 1] - self.stride = [2, 2] - self.dilations = [1, 1] - self.groups = 1 - self.input_size = [2, 3, 5, 5] # NCHW - f_c = self.input_size[1] - self.filter_size = [f_c, 6, 3, 3] - + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + 
self.filter_size = [f_c, 6, 3, 3] + + def init_op_type(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "conv2d_transpose" + + class TestWithSymmetricPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithAsymmetricPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 0, 1, 2] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + class TestWithSAMEPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.stride = [2, 1] + self.dilations = [1, 2] + self.groups = 1 + self.input_size = [2, 3, 6, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 4, 3] + self.padding_algorithm = 'SAME' + + class TestWithVALIDPad(TestConv2DTransposeOp): + + def init_test_case(self): + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + self.padding_algorithm = 'VALID' + + class TestWithGroups(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.groups = 2 + self.input_size = [2, 4, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3] + + class TestWithStride(TestConv2DTransposeOp): + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.dilations = [1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3] + + +support_types = get_xpu_op_support_types('conv2d_transpose') +for stype in support_types: + create_test_class(globals(), XPUTestConv2DTransposeOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py index 0a0a9bb3d365d..0b2470228b94a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py @@ -21,25 +21,293 @@ import numpy as np import paddle import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.op import Operator -from paddle.fluid.executor import Executor -from op_test import OpTest -from test_gaussian_random_op import TestGaussianRandomOp +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper +import paddle paddle.enable_static() -class TestXPUGaussianRandomOp(TestGaussianRandomOp): +class XPUTestGaussianRandomOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'gaussian_random' + self.use_dynamic_create_class = False + + class TestGaussianRandomOp(XPUOpTest): + + def init(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = 'gaussian_random' + + def setUp(self): + self.init() + self.python_api = paddle.normal + self.set_attrs() + self.inputs = {} + self.use_mkldnn = False + self.attrs = { + "shape": [123, 92], + "mean": self.mean, + "std": self.std, + "seed": 10, + "use_mkldnn": self.use_mkldnn + } + paddle.seed(10) + + self.outputs = {'Out': np.zeros((123, 92), dtype=self.dtype)} + + def 
set_attrs(self): + self.mean = 1.0 + self.std = 2. + + def test_check_output(self): + self.check_output_with_place_customized(self.verify_output, + self.place) + + def verify_output(self, outs): + self.assertEqual(outs[0].shape, (123, 92)) + hist, _ = np.histogram(outs[0], range=(-3, 5)) + hist = hist.astype("float32") + hist /= float(outs[0].size) + data = np.random.normal(size=(123, 92), loc=1, scale=2) + hist2, _ = np.histogram(data, range=(-3, 5)) + hist2 = hist2.astype("float32") + hist2 /= float(outs[0].size) + self.assertTrue(np.allclose(hist, hist2, rtol=0, atol=0.01), + "hist: " + str(hist) + " hist2: " + str(hist2)) + + class TestMeanStdAreInt(TestGaussianRandomOp): + + def set_attrs(self): + self.mean = 1 + self.std = 2 + + # Situation 2: Attr(shape) is a list(with tensor) + class TestGaussianRandomOp_ShapeTensorList(TestGaussianRandomOp): + + def setUp(self): + '''Test gaussian_random op with specified value + ''' + self.init() + self.init_data() + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.attrs = { + 'shape': self.infer_shape, + 'mean': self.mean, + 'std': self.std, + 'seed': self.seed, + 'use_mkldnn': self.use_mkldnn + } + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.outputs = {'Out': np.zeros(self.shape, dtype=self.dtype)} + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, 92] + self.use_mkldnn = False + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + def test_check_output(self): + self.check_output_with_place_customized(self.verify_output, + self.place) + + class TestGaussianRandomOp2_ShapeTensorList( + TestGaussianRandomOp_ShapeTensorList): + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [-1, -1] + self.use_mkldnn = False + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + class TestGaussianRandomOp3_ShapeTensorList( + TestGaussianRandomOp_ShapeTensorList): + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.use_mkldnn = True + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + class TestGaussianRandomOp4_ShapeTensorList( + TestGaussianRandomOp_ShapeTensorList): + + def init_data(self): + self.shape = [123, 92] + self.infer_shape = [123, -1] + self.use_mkldnn = False + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + # Situation 3: shape is a tensor + class TestGaussianRandomOp1_ShapeTensor(TestGaussianRandomOp): + + def setUp(self): + '''Test gaussian_random op with specified value + ''' + self.init() + self.init_data() + self.use_mkldnn = False + + self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} + self.attrs = { + 'mean': self.mean, + 'std': self.std, + 'seed': self.seed, + 'use_mkldnn': self.use_mkldnn + } + self.outputs = {'Out': np.zeros((123, 92), dtype=self.dtype)} + + def init_data(self): + self.shape = [123, 92] + self.use_mkldnn = False + self.mean = 1.0 + self.std = 2.0 + self.seed = 10 + + +# Test python API +class TestGaussianRandomAPI(unittest.TestCase): + + def test_api(self): + positive_2_int32 = fluid.layers.fill_constant([1], "int32", 2000) + + positive_2_int64 = fluid.layers.fill_constant([1], "int64", 500) + shape_tensor_int32 = fluid.data(name="shape_tensor_int32", + shape=[2], + dtype="int32") + + shape_tensor_int64 = fluid.data(name="shape_tensor_int64", + shape=[2], + dtype="int64") + + out_1 = fluid.layers.gaussian_random(shape=[2000, 500], + dtype="float32", + mean=0.0, + std=1.0, + seed=10) + 
+ out_2 = fluid.layers.gaussian_random(shape=[2000, positive_2_int32], + dtype="float32", + mean=0., + std=1.0, + seed=10) + + out_3 = fluid.layers.gaussian_random(shape=[2000, positive_2_int64], + dtype="float32", + mean=0., + std=1.0, + seed=10) + + out_4 = fluid.layers.gaussian_random(shape=shape_tensor_int32, + dtype="float32", + mean=0., + std=1.0, + seed=10) + + out_5 = fluid.layers.gaussian_random(shape=shape_tensor_int64, + dtype="float32", + mean=0., + std=1.0, + seed=10) + + out_6 = fluid.layers.gaussian_random(shape=shape_tensor_int64, + dtype=np.float32, + mean=0., + std=1.0, + seed=10) + + exe = fluid.Executor(place=fluid.XPUPlace(0)) + res_1, res_2, res_3, res_4, res_5, res_6 = exe.run( + fluid.default_main_program(), + feed={ + "shape_tensor_int32": np.array([2000, 500]).astype("int32"), + "shape_tensor_int64": np.array([2000, 500]).astype("int64"), + }, + fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6]) + + self.assertAlmostEqual(np.mean(res_1), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_1), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_2), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_2), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_3), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_3), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_4), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_5), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_5), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_5), 1., delta=0.1) + self.assertAlmostEqual(np.mean(res_6), 0.0, delta=0.1) + self.assertAlmostEqual(np.std(res_6), 1., delta=0.1) + + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.gaussian([2, 3]) + + self.assertRaises(TypeError, test_default_fp16) + + def test_default_fp32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.gaussian([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.gaussian([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp64() + test_default_fp32() + + paddle.enable_static() + + +class TestStandardNormalDtype(unittest.TestCase): + + def test_default_dtype(self): + paddle.disable_static() + + def test_default_fp16(): + paddle.framework.set_default_dtype('float16') + paddle.tensor.random.standard_normal([2, 3]) + + self.assertRaises(TypeError, test_default_fp16) + + def test_default_fp32(): + paddle.framework.set_default_dtype('float32') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) + + def test_default_fp64(): + paddle.framework.set_default_dtype('float64') + out = paddle.tensor.random.standard_normal([2, 3]) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + + test_default_fp64() + test_default_fp32() + + paddle.enable_static() - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - outs = self.calc_output(place) - outs = [np.array(out) for out in outs] - outs.sort(key=len) - self.verify_output(outs) +support_types = get_xpu_op_support_types('gaussian_random') +for stype in support_types: + create_test_class(globals(), XPUTestGaussianRandomOp, stype) if __name__ == "__main__": unittest.main() From 8f8a68485543ad735e0ab212283264d8eaa50898 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 4 Jul 
2022 06:54:10 -0500 Subject: [PATCH 048/250] unify cpu context (#44049) --- .../eager_generated/backwards/scale_node.cc | 18 +++++++++--------- paddle/fluid/platform/device_context.h | 1 - 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 4ee33ad100f16..1409119daf1d3 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -117,20 +117,20 @@ void ScaleAPI(const paddle::experimental::Tensor& x, paddle::platform::DeviceContextPool::Instance(); if (expected_kernel_place == paddle::platform::CPUPlace()) { - auto* dev_ctx = dynamic_cast( - pool.Get(expected_kernel_place)); + auto* dev_ctx = + dynamic_cast(pool.Get(expected_kernel_place)); if (!dev_ctx) { PADDLE_THROW(paddle::platform::errors::Fatal( - "Cannot convert device_context to CPUDeviceContext." + "Cannot convert device_context to phi::CPUContext." "This indicates backend mismatch." "Pleas double check your expected place")); } - ScaleDeviceDispatch(*dense_tensor.get(), - *dev_ctx, - scale, - bias, - bias_after_scale, - dense_out.get()); + ScaleDeviceDispatch(*dense_tensor.get(), + *dev_ctx, + scale, + bias, + bias_after_scale, + dense_out.get()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (expected_kernel_place == paddle::platform::CUDAPlace()) { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 1b7aafdac6f29..4459c913f005d 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -133,7 +133,6 @@ constexpr DeviceType kIPU = DeviceType::IPU; constexpr DeviceType kMLU = DeviceType::MLU; using DeviceContext = phi::DeviceContext; -using CPUDeviceContext = phi::CPUContext; template struct DefaultDeviceContextType; From bd06a828ee3bc564946932df44fc0c3699300c21 Mon Sep 17 00:00:00 2001 From: zhaoying9105 Date: Mon, 4 Jul 2022 20:16:24 +0800 Subject: [PATCH 049/250] [MLU]: add hard_sigmoid,hard_sigmoid_grad,hard_swish,hard_swish_grad kernel (#44044) --- paddle/fluid/operators/activation_op_mlu.cc | 160 +++++++++++++++ .../unittests/mlu/test_hard_sigmoid_op_mlu.py | 194 ++++++++++++++++++ .../unittests/mlu/test_hard_swish_op_mlu.py | 165 +++++++++++++++ 3 files changed, 519 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index e19ce87e7c8ec..6ba86351e6af5 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -256,6 +256,149 @@ class ExpGradMLUKernel : public framework::OpKernel { } }; +template +class HardSwishMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->mutable_data(ctx.GetPlace()); + float threshold = ctx.Attr("threshold"); + float scale = ctx.Attr("scale"); + float offset = ctx.Attr("offset"); + PADDLE_ENFORCE_EQ(threshold, + 6.0f, + platform::errors::External( + "Not support threshold [%f] in MLU", threshold)); + PADDLE_ENFORCE_EQ( + scale, + 6.0f, + platform::errors::External("Not support scale [%f] in MLU", 
scale)); + PADDLE_ENFORCE_EQ( + offset, + 3.0f, + platform::errors::External("Not support offset [%f] in MLU", offset)); + + MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH, + 1.0f /*ceof useless*/); + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + + MLUCnnl::Active(ctx, + act_desc.get(), + input_desc.get(), + GetBasePtr(input), + output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class HardSwishGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + float threshold = ctx.Attr("threshold"); + float scale = ctx.Attr("scale"); + float offset = ctx.Attr("offset"); + PADDLE_ENFORCE_EQ(threshold, + 6.0f, + platform::errors::External( + "Not support threshold [%f] in MLU", threshold)); + PADDLE_ENFORCE_EQ( + scale, + 6.0f, + platform::errors::External("Not support scale [%f] in MLU", scale)); + PADDLE_ENFORCE_EQ( + offset, + 3.0f, + platform::errors::External("Not support offset [%f] in MLU", offset)); + auto* out = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSWISH, + 1.0f /*ceof useless*/); + MLUCnnl::ActiveGrad(ctx, + act_desc.get(), + nullptr, + nullptr, + nullptr, + nullptr, + dout_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(out), + dx_desc.get(), + GetBasePtr(dx)); + } +}; + +template +class HardSigmoidMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + float slope = ctx.Attr("slope"); + float offset = ctx.Attr("offset"); + output->mutable_data(ctx.GetPlace()); + + MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID, + 1.0f /*ceof useless*/, + 1.0f /*sliced_dim useless*/, + slope, + offset); + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc output_desc(*output); + + MLUCnnl::Active(ctx, + act_desc.get(), + input_desc.get(), + GetBasePtr(input), + output_desc.get(), + GetBasePtr(output)); + } +}; + +template +class HardSigmoidGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + auto* dx = ctx.Output(framework::GradVarName("X")); + float slope = ctx.Attr("slope"); + float offset = ctx.Attr("offset"); + dx->mutable_data(ctx.GetPlace()); + + MLUCnnlActivationDesc act_desc(CNNL_ACTIVATION_HARDSIGMOID, + 1.0f /*ceof useless*/, + 1.0f /*sliced_dim useless*/, + slope, + offset); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnl::ActiveGrad(ctx, + act_desc.get(), + nullptr, + nullptr, + nullptr, + nullptr, + dout_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(out), + dx_desc.get(), + GetBasePtr(dx)); + } +}; + } // namespace operators } // namespace paddle @@ -359,3 +502,20 @@ REGISTER_OP_MLU_KERNEL(exp, REGISTER_OP_MLU_KERNEL(exp_grad, ops::ExpGradMLUKernel, ops::ExpGradMLUKernel); + +REGISTER_OP_MLU_KERNEL(hard_swish, + ops::HardSwishMLUKernel, + ops::HardSwishMLUKernel); + +REGISTER_OP_MLU_KERNEL(hard_swish_grad, + ops::HardSwishGradMLUKernel, + 
ops::HardSwishGradMLUKernel); + +REGISTER_OP_MLU_KERNEL(hard_sigmoid, + ops::HardSigmoidMLUKernel, + ops::HardSigmoidMLUKernel); + +REGISTER_OP_MLU_KERNEL( + hard_sigmoid_grad, + ops::HardSigmoidGradMLUKernel, + ops::HardSigmoidGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py new file mode 100644 index 0000000000000..a38c12c900470 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py @@ -0,0 +1,194 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.nn.functional as F + +paddle.enable_static() +SEED = 2021 + + +def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5): + return np.maximum(np.minimum(x * slope + offset, 1.), 0.).astype(x.dtype) + + +class TestMLUHardSigmoid(OpTest): + + def setUp(self): + paddle.enable_static() + + self.op_type = "hard_sigmoid" + self.set_mlu() + self.init_dtype() + self.set_attrs() + + x = np.random.uniform(-5, 5, [10, 12]).astype(self.dtype) + lower_threshold = -self.offset / self.slope + upper_threshold = (1. 
- self.offset) / self.slope + + # Same reason as TestAbs + delta = 0.005 + x[np.abs(x - lower_threshold) < delta] = lower_threshold - 0.02 + x[np.abs(x - upper_threshold) < delta] = upper_threshold - 0.02 + + out = ref_hardsigmoid(x, self.slope, self.offset) + + self.attrs = {'slope': self.slope, 'offset': self.offset} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def set_attrs(self): + self.slope = 0.166666666666667 + self.offset = 0.5 + + +class TestMLUHardSigmoid2(TestMLUHardSigmoid): + + def set_attrs(self): + self.slope = 0.2 + self.offset = 0.5 + + +class TestMLUHardSigmoid3(TestMLUHardSigmoid): + + def set_attrs(self): + self.slope = 0.2 + self.offset = 0.4 + + +class TestMLUHardSigmoidFp16(unittest.TestCase): + + def setUp(self): + paddle.disable_static() + + self.place = paddle.MLUPlace(0) + self.dtype = np.float32 + + # float32 + self.float32_x = np.random.uniform(-5, 5, [10, 12]).astype(np.float32) + paddle.set_device('cpu') + data = paddle.to_tensor(self.float32_x, stop_gradient=True) + self.float32_y = F.hardsigmoid(data) + + # float16 + self.float16_x = self.float32_x.astype(np.float16) + self.float16_y = ref_hardsigmoid(self.float16_x) + + def test_check_output_and_grad_mlu(self): + # mlu float16 + paddle.set_device('mlu') + data = paddle.to_tensor(self.float16_x, stop_gradient=True) + mlu_float16_y = F.hardsigmoid(data) + + cpu_diff_1 = np.divide( + np.sum(np.abs(self.float32_y.numpy() - self.float16_y)), + np.sum(np.abs(self.float32_y.numpy()))) + mlu_diff_1 = np.divide( + np.sum(np.abs(self.float32_y.numpy() - mlu_float16_y.numpy())), + np.sum(np.abs(self.float32_y.numpy()))) + + cpu_diff_2 = np.divide( + np.sum(np.square(self.float32_y.numpy() - self.float16_y)), + np.sum(np.square(self.float32_y.numpy()))) + mlu_diff_2 = np.divide( + np.sum(np.square(self.float32_y.numpy() - mlu_float16_y.numpy())), + np.sum(np.square(self.float32_y.numpy()))) + assert mlu_diff_1 <= cpu_diff_1 + assert mlu_diff_2 <= cpu_diff_2 + + +class TestHardsigmoidAPI(unittest.TestCase): + # test paddle.nn.Hardsigmoid, paddle.nn.functional.hardsigmoid + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [10, 12]).astype(np.float32) + self.place = paddle.MLUPlace(0) + + def test_static_api(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data('X', self.x_np.shape, self.x_np.dtype) + out1 = F.hardsigmoid(x) + m = paddle.nn.Hardsigmoid() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = ref_hardsigmoid(self.x_np) + for r in res: + self.assertTrue(np.allclose(out_ref, r)) + + def test_dygraph_api(self): + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.hardsigmoid(x) + m = paddle.nn.Hardsigmoid() + out2 = m(x) + out_ref = ref_hardsigmoid(self.x_np) + for r in [out1, out2]: + self.assertTrue(np.allclose(out_ref, r.numpy())) + paddle.enable_static() + + def test_fluid_api(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data('X', self.x_np.shape, self.x_np.dtype) + out = fluid.layers.hard_sigmoid(x) + exe = fluid.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = 
ref_hardsigmoid(self.x_np, 0.2, 0.5) + self.assertTrue(np.allclose(out_ref, res[0])) + + paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out = paddle.fluid.layers.hard_sigmoid(x) + self.assertTrue(np.allclose(out_ref, out.numpy())) + paddle.enable_static() + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.hardsigmoid, 1) + # The input dtype must be float16, float32, float64. + x_int32 = paddle.fluid.data(name='x_int32', + shape=[12, 10], + dtype='int32') + self.assertRaises(TypeError, F.hardsigmoid, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data(name='x_fp16', + shape=[12, 10], + dtype='float16') + F.hardsigmoid(x_fp16) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py new file mode 100644 index 0000000000000..e0ae182b41d19 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py @@ -0,0 +1,165 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle.nn.functional as F +import paddle.fluid as fluid +import paddle +from op_test import OpTest + +import numpy as np +import unittest +import sys + +sys.path.append("..") + +paddle.enable_static() +SEED = 2020 + + +def scalarToType(val, data_type): + converted_val = np.array([val]).astype(data_type)[0] + print("converted_val type: ", type(converted_val)) + return converted_val + + +def ref_hard_swish_grad(x, threshold, scale, offset, data_type): + threshold = scalarToType(threshold, data_type) + scale = scalarToType(scale, data_type) + offset = scalarToType(offset, data_type) + dout = np.full_like(x, fill_value=1. 
/ x.size) + tmp = ((x + offset) < threshold).astype(x.dtype) + dx = dout * (((x + offset) > 0).astype(x.dtype) * + (2 * x + offset) * tmp / scale + 1.0 - tmp) + return dx + + +class TestHardSwishMLU(OpTest): + + def setUp(self): + paddle.enable_static() + + self.op_type = "hard_swish" + self.place = paddle.MLUPlace(0) + self.init_dtype() + + x = np.random.uniform(-2, 2, [10, 12]).astype(self.dtype) + threshold = 6.0 + scale = 6.0 + offset = 3.0 + + x[np.abs(x + offset) < 0.005] = 0.02 + x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02 + + out = ( + x * + (np.minimum(np.maximum(x + offset, 0.), threshold) / scale)).astype( + self.dtype) + self.x_grad = ref_hard_swish_grad(x, threshold, scale, offset, + self.dtype) + self.set_mlu() + self.inputs = {'X': x} + self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset} + self.outputs = {'Out': out} + + def set_mlu(self): + self.__class__.use_mlu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestHardSwishMLUWithCPUFloat16(unittest.TestCase): + + def setUp(self): + paddle.disable_static() + + self.place = paddle.MLUPlace(0) + self.dtype = np.float32 + + # float32 + self.float32_x = np.random.uniform(-6, 10, [8, 15]).astype(np.float32) + paddle.set_device('cpu') + data = paddle.to_tensor(self.float32_x, stop_gradient=False) + self.float32_y = F.hardswish(data) + self.float32_y.sum().backward() + self.float32_grad = data.grad + + # float16 + self.float16_x = self.float32_x.astype(np.float16) + threshold = 6.0 + scale = 6.0 + offset = 3.0 + + threshold = scalarToType(threshold, np.float16) + scale = scalarToType(scale, np.float16) + offset = scalarToType(offset, np.float16) + self.float16_y = (self.float16_x * (np.minimum( + np.maximum(self.float16_x + offset, scalarToType(0., np.float16)), + threshold) / scale)).astype(np.float16) + self.float16_grad = ref_hard_swish_grad(self.float16_x, threshold, + scale, offset, np.float16) + + def test_check_output_and_grad_mlu(self): + # mlu float16 + paddle.set_device('mlu') + data = paddle.to_tensor(self.float16_x, stop_gradient=False) + mlu_float16_y = F.hardswish(data) + mlu_float16_y.sum().backward() + mlu_float16_grad = data.grad + + cpu_diff_1 = np.divide( + np.sum(np.abs(self.float32_y.numpy() - self.float16_y)), + np.sum(np.abs(self.float32_y.numpy()))) + mlu_diff_1 = np.divide( + np.sum(np.abs(self.float32_y.numpy() - mlu_float16_y.numpy())), + np.sum(np.abs(self.float32_y.numpy()))) + + cpu_diff_2 = np.divide( + np.sum(np.square(self.float32_y.numpy() - self.float16_y)), + np.sum(np.square(self.float32_y.numpy()))) + mlu_diff_2 = np.divide( + np.sum(np.square(self.float32_y.numpy() - mlu_float16_y.numpy())), + np.sum(np.square(self.float32_y.numpy()))) + assert mlu_diff_1 <= cpu_diff_1 + assert mlu_diff_2 <= cpu_diff_2 + + cpu_diff_1 = np.divide( + np.sum(np.abs(self.float32_grad.numpy() - self.float16_grad)), + np.sum(np.abs(self.float32_grad.numpy()))) + mlu_diff_1 = np.divide( + np.sum(np.abs(self.float32_grad.numpy() - + mlu_float16_grad.numpy())), + np.sum(np.abs(self.float32_grad.numpy()))) + + cpu_diff_2 = np.divide( + np.sum(np.square(self.float32_grad.numpy() - self.float16_grad)), + np.sum(np.square(self.float32_grad.numpy()))) + mlu_diff_2 = np.divide( + np.sum( + np.square(self.float32_grad.numpy() - + mlu_float16_grad.numpy())), + 
np.sum(np.square(self.float32_grad.numpy()))) + assert mlu_diff_1 <= cpu_diff_1 + assert mlu_diff_2 <= cpu_diff_2 + + +if __name__ == '__main__': + unittest.main() From 91c0f727d7e0cef4c9a18534b274200b8bddb5ea Mon Sep 17 00:00:00 2001 From: Chenxiao Niu Date: Mon, 4 Jul 2022 20:16:57 +0800 Subject: [PATCH 050/250] [MLU] uncomment some interp_v2 tests. (#44053) --- .../mlu/test_bilinear_interp_v2_op_mlu.py | 94 ++++++++------- .../mlu/test_nearest_interp_v2_op_mlu.py | 111 ++++++++++-------- 2 files changed, 110 insertions(+), 95 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py index b8c31578099e1..9806a4f74307d 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py @@ -515,51 +515,55 @@ def init_test_case(self): self.scale_by_1Dtensor = True -#TODO: comment this test for now until bilinear_interp_op added. -# class TestBilinearInterpOpAPI(unittest.TestCase): -# def test_case(self): -# x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") - -# dim = fluid.data(name="dim", shape=[1], dtype="int32") -# shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") -# actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") -# scale_tensor = fluid.data( -# name="scale_tensor", shape=[1], dtype="float32") - -# out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12]) -# out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim]) -# out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor) -# out4 = fluid.layers.resize_bilinear( -# x, out_shape=[4, 4], actual_shape=actual_size) -# out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor) - -# x_data = np.random.random((2, 3, 6, 6)).astype("float32") -# dim_data = np.array([12]).astype("int32") -# shape_data = np.array([12, 12]).astype("int32") -# actual_size_data = np.array([12, 12]).astype("int32") -# scale_data = np.array([2.0]).astype("float32") - -# if core.is_compiled_with_mlu(): -# place = paddle.device.MLUPlace(0) -# else: -# place = core.CPUPlace() -# exe = fluid.Executor(place) -# exe.run(fluid.default_startup_program()) -# results = exe.run(fluid.default_main_program(), -# feed={ -# "x": x_data, -# "dim": dim_data, -# "shape_tensor": shape_data, -# "actual_size": actual_size_data, -# "scale_tensor": scale_data -# }, -# fetch_list=[out1, out2, out3, out4, out5], -# return_numpy=True) - -# expect_res = bilinear_interp_np( -# x_data, out_h=12, out_w=12, align_corners=True) -# for res in results: -# self.assertTrue(np.allclose(res, expect_res)) +class TestBilinearInterpOpAPI(unittest.TestCase): + + def test_case(self): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") + scale_tensor = fluid.data(name="scale_tensor", + shape=[1], + dtype="float32") + + out1 = fluid.layers.resize_bilinear(x, out_shape=[12, 12]) + out2 = fluid.layers.resize_bilinear(x, out_shape=[12, dim]) + out3 = fluid.layers.resize_bilinear(x, out_shape=shape_tensor) + out4 = fluid.layers.resize_bilinear(x, + out_shape=[4, 4], + actual_shape=actual_size) + out5 = fluid.layers.resize_bilinear(x, scale=scale_tensor) + + x_data = np.random.random((2, 3, 6, 6)).astype("float32") + 
dim_data = np.array([12]).astype("int32") + shape_data = np.array([12, 12]).astype("int32") + actual_size_data = np.array([12, 12]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + if core.is_compiled_with_mlu(): + place = paddle.device.MLUPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "dim": dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5], + return_numpy=True) + + expect_res = bilinear_interp_np(x_data, + out_h=12, + out_w=12, + align_corners=True) + for res in results: + self.assertTrue(np.allclose(res, expect_res)) class TestBilinearInterpOpAPI_dy(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py index e9235e62a7989..59078a21d0fa8 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_nearest_interp_v2_op_mlu.py @@ -274,6 +274,7 @@ def init_test_case(self): self.align_corners = True +# comment out since 5-D input not supported now # class TestNearestNeighborInterpCase1(TestNearestInterpOp): # def init_test_case(self): # self.interp_method = 'nearest' @@ -537,56 +538,66 @@ def init_test_case(self): self.scale_by_1Dtensor = True -#TODO: comment this test for now until nearest_interp_op added. -# class TestNearestAPI(unittest.TestCase): -# def test_case(self): -# x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") -# y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32") - -# dim = fluid.data(name="dim", shape=[1], dtype="int32") -# shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") -# actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") -# scale_tensor = fluid.data( -# name="scale_tensor", shape=[1], dtype="float32") - -# out1 = fluid.layers.resize_nearest( -# y, out_shape=[12, 12], data_format='NHWC', align_corners=False) -# out2 = fluid.layers.resize_nearest( -# x, out_shape=[12, dim], align_corners=False) -# out3 = fluid.layers.resize_nearest( -# x, out_shape=shape_tensor, align_corners=False) -# out4 = fluid.layers.resize_nearest( -# x, out_shape=[4, 4], actual_shape=actual_size, align_corners=False) -# out5 = fluid.layers.resize_nearest( -# x, scale=scale_tensor, align_corners=False) - -# x_data = np.random.random((2, 3, 6, 6)).astype("float32") -# dim_data = np.array([12]).astype("int32") -# shape_data = np.array([12, 12]).astype("int32") -# actual_size_data = np.array([12, 12]).astype("int32") -# scale_data = np.array([2.0]).astype("float32") - -# place = paddle.MLUPlace(0) -# exe = fluid.Executor(place) -# exe.run(fluid.default_startup_program()) -# results = exe.run(fluid.default_main_program(), -# feed={ -# "x": x_data, -# "y": np.transpose(x_data, (0, 2, 3, 1)), -# "dim": dim_data, -# "shape_tensor": shape_data, -# "actual_size": actual_size_data, -# "scale_tensor": scale_data -# }, -# fetch_list=[out1, out2, out3, out4, out5], -# return_numpy=True) - -# expect_res = nearest_neighbor_interp_np( -# x_data, out_h=12, out_w=12, align_corners=False) -# self.assertTrue( -# np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1)))) -# for i in range(len(results) - 1): -# self.assertTrue(np.allclose(results[i + 1], expect_res)) +class 
TestNearestAPI(unittest.TestCase): + + def test_case(self): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + y = fluid.data(name="y", shape=[2, 6, 6, 3], dtype="float32") + + dim = fluid.data(name="dim", shape=[1], dtype="int32") + shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + actual_size = fluid.data(name="actual_size", shape=[2], dtype="int32") + scale_tensor = fluid.data(name="scale_tensor", + shape=[1], + dtype="float32") + + out1 = fluid.layers.resize_nearest(y, + out_shape=[12, 12], + data_format='NHWC', + align_corners=False) + out2 = fluid.layers.resize_nearest(x, + out_shape=[12, dim], + align_corners=False) + out3 = fluid.layers.resize_nearest(x, + out_shape=shape_tensor, + align_corners=False) + out4 = fluid.layers.resize_nearest(x, + out_shape=[4, 4], + actual_shape=actual_size, + align_corners=False) + out5 = fluid.layers.resize_nearest(x, + scale=scale_tensor, + align_corners=False) + + x_data = np.random.random((2, 3, 6, 6)).astype("float32") + dim_data = np.array([12]).astype("int32") + shape_data = np.array([12, 12]).astype("int32") + actual_size_data = np.array([12, 12]).astype("int32") + scale_data = np.array([2.0]).astype("float32") + + place = paddle.MLUPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + results = exe.run(fluid.default_main_program(), + feed={ + "x": x_data, + "y": np.transpose(x_data, (0, 2, 3, 1)), + "dim": dim_data, + "shape_tensor": shape_data, + "actual_size": actual_size_data, + "scale_tensor": scale_data + }, + fetch_list=[out1, out2, out3, out4, out5], + return_numpy=True) + + expect_res = nearest_neighbor_interp_np(x_data, + out_h=12, + out_w=12, + align_corners=False) + self.assertTrue( + np.allclose(results[0], np.transpose(expect_res, (0, 2, 3, 1)))) + for i in range(len(results) - 1): + self.assertTrue(np.allclose(results[i + 1], expect_res)) class TestNearestInterpException(unittest.TestCase): From faaa95ca62942dbb374636404426fac13745ee71 Mon Sep 17 00:00:00 2001 From: yaozhixin Date: Mon, 4 Jul 2022 20:18:15 +0800 Subject: [PATCH 051/250] update paddle.distributed.launch en doc (#44016) --- python/paddle/distributed/launch/main.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index 4c1b99df178ea..fccb352c2a3cd 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -98,11 +98,11 @@ def launch(): The ``training_script_args`` includes arguments required by IPU distributed launch and illustrated as below. ``Examples 10`` has provided a example of paddle.distributed.launch with IPUs. - - ``--hosts``: The hosts for IPU distributd training. + - ``--hosts``: The hosts for IPU distributd training. Each host is able to include multiple processes. - - ``--nproc_per_host``: The number of processes launched per host. + - ``--nproc_per_host``: The number of processes launched per host. Each process is able to include multiple replicas. - - ``--ipus_per_replica``: The number of IPUs requested per replica. + - ``--ipus_per_replica``: The number of IPUs requested per replica. Each replica is able to include multiple IPUs. - ``--ipu_partition``: The partition name of IPU devices. @@ -110,7 +110,7 @@ def launch(): - ``training_script``: The full path to the IPU distributed training program/script to be launched in parallel. e.g., ``training.py``. 
- - ``training_script_args``: The args of the IPU distributed training program/script. + - ``training_script_args``: The args of the IPU distributed training program/script. e.g., ``--lr=0.1``. Returns: - ``None`` @@ -253,9 +253,11 @@ def launch(): .. code-block:: bash :name: code-block-example-bash10 - # With the following command, the job will begin to run the distributhed program with IPUs. - # Only support and require the `device_num` as the arg and `ipu` as the launch script. - # Please Check the details about the following args of the launch scripte from `utils/ipu_launch.py`. + # With the following command, the job will begin to run the distributhed program with IPUs + # Require `devices` as the number of IPUs + # Require `training_script` to be set as `ipu` + # Require `training_script_args` as the arguments of IPU distributed training instead of the arguments of the training program/script + # Please Check the `IPU Parameters` for details python -m paddle.distributed.launch --devices 4 ipu --hosts=localhost --nproc_per_host=2 --ipus_per_replica=1 --ipu_partition=pod16 --vipu_server=127.0.0.1 train.py """ From 30846bc9ccd15ef8a62acff7277034b76b4e2a4a Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 4 Jul 2022 20:58:08 +0800 Subject: [PATCH 052/250] fix op filter rule (#44063) --- python/paddle/profiler/profiler_statistic.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py index f33335c907d7a..c1dd9c48fb5a1 100755 --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -79,19 +79,14 @@ def __init__(self, hostnode): self.self_gpu_time = 0 self.general_gpu_time = 0 # besides kernel, include time of gpu events like memcpy and memset self.self_general_gpu_time = 0 - self.is_terminal_operator_node = True def cal_statistic(self): for child in self.children_node: child.cal_statistic() - if child.is_terminal_operator_node == False: - self.is_terminal_operator_node = False for rt in self.runtime_node: rt.cal_statistic() self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns for child in self.children_node: - if child.type == TracerEventType.Operator: - self.is_terminal_operator_node = False self.gpu_time += child.gpu_time self.general_gpu_time += child.general_gpu_time self.self_cpu_time -= (child.end_ns - child.start_ns) @@ -421,10 +416,11 @@ def add_item(self, node): self.add_gpu_time(node.gpu_time) self.add_general_gpu_time(node.general_gpu_time) for child in node.children_node: - if child.name not in self.operator_inners: - self.operator_inners[ - child.name] = EventSummary.OperatorItem(child.name) - self.operator_inners[child.name].add_item(child) + if child.type != TracerEventType.Operator: + if child.name not in self.operator_inners: + self.operator_inners[ + child.name] = EventSummary.OperatorItem(child.name) + self.operator_inners[child.name].add_item(child) for runtimenode in node.runtime_node: for devicenode in runtimenode.device_node: @@ -537,8 +533,6 @@ def parse(self, nodetrees): deque.append(child) def add_operator_item(self, operator_node): - if operator_node.is_terminal_operator_node == False: - return if operator_node.name not in self.items: self.items[operator_node.name] = EventSummary.OperatorItem( operator_node.name) From 7a212593946333ff22d7bd36cb1df332945367ed Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Mon, 4 Jul 2022 23:05:37 +0800 Subject: [PATCH 053/250] fix 
delete_quant_dequant_op_pass (#44046) --- paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc index 40861638a2ab2..e0d490ce83680 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -89,6 +89,7 @@ void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const { std::string quantized_op_type = op_desc->Type(); op_desc->SetAttr("Input_scale", input_scale); op_desc->SetAttr("bit_length", bit_length); + op_desc->SetAttr("enable_int8", true); op_desc->RenameInput(quant_dequant_output_name, input_name); op_desc->Flush(); IR_NODE_LINK_TO(input, quantized_node); From c10aa24f604f3bf4256c89d41d875f4d8ebff2a2 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Tue, 5 Jul 2022 09:34:38 +0800 Subject: [PATCH 054/250] [Dy2St]Add BaseTransformer for dy2st error message (#44054) * Add BaseTransformer for dy2st error message * Fix return_transformer error * Polish dy2st error info in runtime * Fix UT error * Polish runtime error code --- .../dygraph_to_static/assert_transformer.py | 3 +- .../dygraph_to_static/ast_transformer.py | 3 +- .../dygraph_to_static/base_transformer.py | 38 +++++++++++++++++++ .../basic_api_transformer.py | 5 ++- .../break_continue_transformer.py | 3 +- .../dygraph_to_static/call_transformer.py | 3 +- .../dygraph_to_static/cast_transformer.py | 3 +- .../early_return_transformer.py | 3 +- .../fluid/dygraph/dygraph_to_static/error.py | 14 +++++-- .../dygraph_to_static/grad_transformer.py | 3 +- .../dygraph_to_static/ifelse_transformer.py | 3 +- .../dygraph_to_static/list_transformer.py | 4 +- .../dygraph_to_static/logical_transformer.py | 3 +- .../dygraph_to_static/loop_transformer.py | 3 +- .../dygraph_to_static/print_transformer.py | 3 +- .../dygraph_to_static/return_transformer.py | 5 ++- .../tensor_shape_transformer.py | 3 +- .../fluid/dygraph/dygraph_to_static/utils.py | 7 +++- .../test_program_translator.py | 9 +++-- 19 files changed, 92 insertions(+), 26 deletions(-) create mode 100644 python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py index 3d5ca1c136816..57d952fd6bb73 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py @@ -18,9 +18,10 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class AssertTransformer(gast.NodeTransformer): +class AssertTransformer(BaseTransformer): """ A class transforms python assert to convert_assert. 
""" diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index aa01945ac849e..ab4133099eaf3 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -20,6 +20,7 @@ # See details in https://github.com/serge-sans-paille/gast/ import os from paddle.utils import gast +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer from paddle.fluid.dygraph.dygraph_to_static.early_return_transformer import EarlyReturnTransformer from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer @@ -58,7 +59,7 @@ def apply_optimization(transformers): transformers.insert(3, BreakTransformOptimizer) -class DygraphToStaticAst(gast.NodeTransformer): +class DygraphToStaticAst(BaseTransformer): """ Main class to transform Dygraph to Static Graph """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py new file mode 100644 index 0000000000000..127a8e9232422 --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py @@ -0,0 +1,38 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.utils import gast + +from paddle.fluid.dygraph.dygraph_to_static.origin_info import ORIGI_INFO + + +class BaseTransformer(gast.NodeTransformer): + + def visit(self, node): + if not isinstance(node, gast.AST): + msg = ('Expected "gast.AST", but got "{}".').format(type(node)) + raise ValueError(msg) + origin_info = getattr(node, ORIGI_INFO, None) + + result = super(BaseTransformer, self).visit(node) + + iter_result = result + if iter_result is not node and iter_result is not None: + if not isinstance(iter_result, (list, tuple)): + iter_result = (iter_result, ) + if origin_info is not None: + for n in iter_result: + setattr(n, ORIGI_INFO, origin_info) + + return result diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py index acf2c3ec09b5d..2293071c7cd17 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py @@ -17,9 +17,10 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static import utils +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class BasicApiTransformer(gast.NodeTransformer): +class BasicApiTransformer(BaseTransformer): """ Class to transform basic API from dygraph to static graph. 
""" @@ -98,7 +99,7 @@ def _update_class_node_dict(self, node): return False -class ToTensorTransformer(gast.NodeTransformer): +class ToTensorTransformer(BaseTransformer): """ Class to transform paddle.to_tensor and paddle.to_variable to paddle.assign """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py index 7bce234168c7e..020721e85a235 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py @@ -21,6 +21,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ForNodeVisitor from paddle.fluid.dygraph.dygraph_to_static.utils import BaseNodeVisitor from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_bool_node +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer __all__ = ['BreakContinueTransformer'] @@ -28,7 +29,7 @@ CONTINUE_NAME_PREFIX = '__continue' -class ForToWhileTransformer(gast.NodeTransformer): +class ForToWhileTransformer(BaseTransformer): """ Transform python for loop into while loop and add condition node in the loop test diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index b14977ced1db5..c9f56287ed3c5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -18,11 +18,12 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import is_paddle_api +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer PDB_SET = "pdb.set_trace" -class CallTransformer(gast.NodeTransformer): +class CallTransformer(BaseTransformer): """ This class transforms function calls into Static Graph Ast. """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py index 3b2d9be99ff00..a297d5cf56ed1 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py @@ -17,9 +17,10 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class CastTransformer(gast.NodeTransformer): +class CastTransformer(BaseTransformer): """ This class transforms type casting into Static Graph Ast. 
""" diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py index bef1efb0427cf..9cf82b020994e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/early_return_transformer.py @@ -16,9 +16,10 @@ from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class EarlyReturnTransformer(gast.NodeTransformer): +class EarlyReturnTransformer(BaseTransformer): """ Transform if/else return statement of Dygraph into Static Graph. """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py index c422c5269e75d..3b868ade4e29b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py @@ -274,19 +274,25 @@ def _simplify_error_value(self): bottom_error_message = error_value_lines[empty_line_idx + 1:] revise_suggestion = self._create_revise_suggestion(bottom_error_message) - user_filepath = '' error_traceback = [] user_code_traceback_index = [] pattern = 'File "(?P.+)", line (?P.+), in (?P.+)' + + # Distinguish user code and framework code using static_info_map + static_info_map = {} + for k, v in self.origin_info_map.items(): + origin_filepath = v.location.filepath + origin_lineno = v.location.lineno + static_info_map[(origin_filepath, origin_lineno)] = k + for i in range(0, len(error_value_lines_strip), 2): if error_value_lines_strip[i].startswith("File "): re_result = re.search(pattern, error_value_lines_strip[i]) tmp_filepath, lineno_str, function_name = re_result.groups() code = error_value_lines_strip[ i + 1] if i + 1 < len(error_value_lines_strip) else '' - if i == 0: - user_filepath = tmp_filepath - if tmp_filepath == user_filepath: + + if static_info_map.get((tmp_filepath, int(lineno_str))): user_code_traceback_index.append(len(error_traceback)) error_traceback.append( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py index d8d8d0bc043dd..09125623e16a5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py @@ -19,9 +19,10 @@ from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static import utils +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class GradTransformer(gast.NodeTransformer): +class GradTransformer(BaseTransformer): """ A class transforms dygraph paddle.grad to static graph paddle.gradients. The transformation is applied to support double grad mode. 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index d4449f6dfc24e..13ac63f91057f 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -33,6 +33,7 @@ from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_node from paddle.fluid.dygraph.dygraph_to_static.utils import create_get_args_node, create_set_args_node +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer TRUE_FUNC_PREFIX = 'true_fn' FALSE_FUNC_PREFIX = 'false_fn' @@ -41,7 +42,7 @@ ARGS_NAME = '__args' -class IfElseTransformer(gast.NodeTransformer): +class IfElseTransformer(BaseTransformer): """ Transform if/else statement of Dygraph into Static Graph. """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py index 48fa9906828c0..e29ec6c6e1d73 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py @@ -21,11 +21,11 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import slice_is_num from paddle.fluid.dygraph.dygraph_to_static.utils import is_control_flow_to_transform - from paddle.fluid.dygraph.dygraph_to_static.utils import SplitAssignTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class ListTransformer(gast.NodeTransformer): +class ListTransformer(BaseTransformer): """ This class transforms python list used in control flow into Static Graph Ast. """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py index 80f5bffe46d1b..3e9a56b0e74dd 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py @@ -16,6 +16,7 @@ from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer cmpop_type_to_str = { gast.Eq: "==", @@ -35,7 +36,7 @@ def cmpop_node_to_str(node): return cmpop_type_to_str[type(node)] -class LogicalTransformer(gast.NodeTransformer): +class LogicalTransformer(BaseTransformer): """ Transform python boolean op into Paddle logical op. 
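For context on the cmpop table kept in logical_transformer.py just above: when dy2st rebuilds a boolean or comparison expression as source text, each gast comparison-operator class has to be mapped back to its textual form, which is what cmpop_node_to_str does. The following standalone sketch shows the same idea under two assumptions: it keys the table on stdlib ast types rather than paddle.utils.gast types, and it uses ast.unparse, which needs Python 3.9 or newer; the names CMPOP_TO_STR and compare_to_source are illustrative, not the patch's code.

import ast

# Same shape as cmpop_type_to_str in the hunk above, keyed on stdlib ast types.
CMPOP_TO_STR = {
    ast.Eq: "==",
    ast.NotEq: "!=",
    ast.Gt: ">",
    ast.GtE: ">=",
    ast.Lt: "<",
    ast.LtE: "<=",
    ast.In: "in",
    ast.NotIn: "not in",
    ast.Is: "is",
    ast.IsNot: "is not",
}


def compare_to_source(node):
    """Rebuild 'left OP right OP right ...' from a parsed ast.Compare node."""
    assert isinstance(node, ast.Compare)
    parts = [ast.unparse(node.left)]
    for op, right in zip(node.ops, node.comparators):
        parts.append(CMPOP_TO_STR[type(op)])
        parts.append(ast.unparse(right))
    return " ".join(parts)


expr = ast.parse("a <= b != c", mode="eval").body
print(compare_to_source(expr))  # prints: a <= b != c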
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 63fc4f0489acb..0485e5abbdf96 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -32,6 +32,7 @@ from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_node, create_get_args_node, create_set_args_node from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import ARGS_NAME +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer __all__ = ['LoopTransformer', 'NameVisitor'] @@ -566,7 +567,7 @@ def filter_name_nodes_from(root_node, target_var_names): return loop_vars - removed_vars -class LoopTransformer(gast.NodeTransformer): +class LoopTransformer(BaseTransformer): """ This class transforms python while/for statement into Static Graph Ast """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index d7a889ad2fc9c..8615b3596e081 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -17,9 +17,10 @@ from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class PrintTransformer(gast.NodeTransformer): +class PrintTransformer(BaseTransformer): """ This class transforms python print function to fluid.layers.Print. """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 7e387b45c4020..072d22d47e029 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -21,6 +21,7 @@ from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import ForToWhileTransformer from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer __all__ = [ 'RETURN_NO_VALUE_MAGIC_NUM', 'RETURN_NO_VALUE_VAR_NAME', 'ReturnTransformer' @@ -57,7 +58,7 @@ def get_return_size(return_node): return return_length -class ReplaceReturnNoneTransformer(gast.NodeTransformer): +class ReplaceReturnNoneTransformer(BaseTransformer): """ Replace 'return None' to 'return' because 'None' cannot be a valid input in control flow. In ReturnTransformer single 'Return' will be appended no @@ -133,7 +134,7 @@ def get_func_max_return_length(self, func_node): return self.max_return_length[func_node] -class ReturnTransformer(gast.NodeTransformer): +class ReturnTransformer(BaseTransformer): """ Transforms return statements into equivalent python statements containing only one return statement at last. 
The basics idea is using a return value diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index 5604a634a171b..88ece85cd139e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -18,9 +18,10 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer -class TensorShapeTransformer(gast.NodeTransformer): +class TensorShapeTransformer(BaseTransformer): """ This class transforms variable.shape into Static Graph Ast. All 'xxx.shape' will be converted int '_jst.Shape(x)'. diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index b51635b85f945..2191046ad1d3e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -644,7 +644,12 @@ def ast_to_source_code(ast_node): type(ast_node)) if isinstance(ast_node, gast.AST): ast_node = gast.gast_to_ast(ast_node) - source_code = astor.to_source(ast_node) + + # Do not wrap lines even if they are too long + def pretty_source(source): + return ''.join(source) + + source_code = astor.to_source(ast_node, pretty_source=pretty_source) return source_code diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index 8d2665129e94e..13399b63e3292 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -196,14 +196,17 @@ def test_decorator(self): program_translator = ProgramTranslator() code = program_translator.get_code(dyfunc_with_if_else) answer = get_source_code(StaticCode1.dyfunc_with_if_else) - self.assertEqual(answer, code) + self.assertEqual( + answer.replace('\n', '').replace(' ', ''), + code.replace('\n', '').replace(' ', '')) def test_program_translator(self): answer = get_source_code(StaticCode2.dyfunc_with_if_else) program_translator = ProgramTranslator() code = program_translator.get_code(dyfunc_with_if_else) - # print(code) - self.assertEqual(answer, code) + self.assertEqual( + answer.replace('\n', '').replace(' ', ''), + code.replace('\n', '').replace(' ', '')) class TestEnableDeclarative(unittest.TestCase): From 30038ba3ec980e9c1e353166f2ccc6ade613e590 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Tue, 5 Jul 2022 09:49:50 +0800 Subject: [PATCH 055/250] Fix conv3d_grad mapping (#44059) * dataloader * fix conv3d mapping bug * fix conv3d_grad mapping --- paddle/phi/ops/compat/conv3d_sig.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/ops/compat/conv3d_sig.cc b/paddle/phi/ops/compat/conv3d_sig.cc index 49f31288d00f6..68bd54609cb03 100644 --- a/paddle/phi/ops/compat/conv3d_sig.cc +++ b/paddle/phi/ops/compat/conv3d_sig.cc @@ -32,7 +32,7 @@ KernelSignature Conv3dOpArgumentMapping(const ArgumentMappingContext& ctx) { } KernelSignature Conv3dGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("conv2d_grad", + return KernelSignature("conv3d_grad", {"Input", "Filter", "Output@GRAD"}, {"strides", 
"paddings", From 52607cf8f12266b9d6069f3ff4db8e5c5385f71e Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Tue, 5 Jul 2022 10:51:53 +0800 Subject: [PATCH 056/250] Remove header file including for boost (#44052) --- paddle/fluid/framework/feed_fetch_method.cc | 1 - paddle/fluid/framework/feed_fetch_type.h | 1 - paddle/fluid/framework/ir/cost_model.h | 1 - paddle/fluid/framework/ir/graph.h | 2 +- paddle/fluid/framework/ir/pass.h | 2 +- paddle/fluid/framework/op_version_registry.h | 1 + paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/tuple.h | 1 - paddle/fluid/framework/type_defs.h | 2 +- paddle/fluid/inference/analysis/argument.h | 2 +- paddle/fluid/memory/stats.cc | 2 +- .../fluid/operators/controlflow/op_variant.h | 1 - .../controlflow/recurrent_op_helper.h | 2 +- .../operators/controlflow/while_op_helper.h | 1 - paddle/fluid/operators/jit/benchmark.cc | 1 - paddle/fluid/operators/jit/registry.h | 2 +- paddle/fluid/operators/math/matrix_bit_code.h | 2 +- paddle/fluid/platform/enforce.h | 2 +- paddle/fluid/platform/flags.h | 2 +- paddle/fluid/platform/monitor.h | 2 + paddle/fluid/platform/place.h | 2 +- paddle/fluid/platform/variant.h | 53 ------------------- paddle/fluid/pybind/data_set_py.cc | 2 +- paddle/fluid/pybind/fleet_wrapper_py.cc | 2 +- .../pybind/global_value_getter_setter.cc | 1 + paddle/fluid/pybind/metrics_py.cc | 2 +- paddle/fluid/pybind/nccl_wrapper_py.cc | 2 +- paddle/fluid/pybind/protobuf.h | 1 - paddle/utils/tribool.h | 25 +-------- .../custom_kernel_dot_c_setup.py | 1 - .../custom_kernel/custom_kernel_dot_setup.py | 1 - tools/prune_for_jetson.py | 2 +- 32 files changed, 22 insertions(+), 104 deletions(-) delete mode 100644 paddle/fluid/platform/variant.h diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 36ab906181be5..47bb60810eb48 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" -#include #include #include "glog/logging.h" diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index 8ecd6a0339b5b..3fe545ec9c569 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/string_array.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/cost_model.h b/paddle/fluid/framework/ir/cost_model.h index 27f059a81eb1f..558f158e84e72 100644 --- a/paddle/fluid/framework/ir/cost_model.h +++ b/paddle/fluid/framework/ir/cost_model.h @@ -27,7 +27,6 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index ea5c46e3040bd..5a954110775d6 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -25,7 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/utils/any.h" DECLARE_bool(convert_all_blocks); diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 967482d2419e9..37a28bec16da2 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/platform/variant.h" +#include "paddle/phi/core/macros.h" #include "paddle/utils/any.h" namespace paddle { diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index c88b947edc686..579dd320d144f 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/macros.h" #include "paddle/utils/none.h" namespace paddle { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index d468ead659258..1b7bd433dd104 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -38,7 +38,7 @@ limitations under the License. */ #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/kernel_factory.h" diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h index 6c283f4d32e57..a06f92f32d28c 100644 --- a/paddle/fluid/framework/tuple.h +++ b/paddle/fluid/framework/tuple.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 5c768b10a3d7e..3bcad63f21a84 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -23,7 +23,7 @@ limitations under the License. */ #include #include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/utils/blank.h" #include "paddle/utils/small_vector.h" #include "paddle/utils/variant.h" diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 016df40c86a2d..e69a1e0e1ffb0 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -35,7 +35,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_analysis_config.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/phi/common/data_type.h" namespace paddle { diff --git a/paddle/fluid/memory/stats.cc b/paddle/fluid/memory/stats.cc index 12312a28f6c2a..0289859dff30e 100644 --- a/paddle/fluid/memory/stats.cc +++ b/paddle/fluid/memory/stats.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/stats.h" #include "paddle/fluid/memory/allocation/spin_lock.h" -#include "paddle/fluid/platform/variant.h" +#include "paddle/phi/core/macros.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h index c75294ce9ab7a..04afe548e92e3 100644 --- a/paddle/fluid/operators/controlflow/op_variant.h +++ b/paddle/fluid/operators/controlflow/op_variant.h @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.h b/paddle/fluid/operators/controlflow/recurrent_op_helper.h index 78fabec56f51a..752a0a1f764eb 100644 --- a/paddle/fluid/operators/controlflow/recurrent_op_helper.h +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/op_variant.h" #include "paddle/fluid/operators/recurrent_op.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index 46c3b056bfdf1..8f7db23769a7e 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/op_variant.h" -#include "paddle/fluid/platform/variant.h" namespace phi { class DenseTensor; diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 7ffdd6ff32ba7..50fd6056d84b0 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -22,7 +22,6 @@ #include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" // for UNUSED DEFINE_int32(burning, 10, "Burning times."); DEFINE_int32(repeat, 3000, "Repeat times."); diff --git a/paddle/fluid/operators/jit/registry.h b/paddle/fluid/operators/jit/registry.h index b006d21f3b558..9d0e47e826075 100644 --- a/paddle/fluid/operators/jit/registry.h +++ b/paddle/fluid/operators/jit/registry.h @@ -22,7 +22,7 @@ #include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_pool.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" // for UNUSED +#include "paddle/phi/core/macros.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 780003c1b451e..7c9d94aa8713b 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/phi/kernels/funcs/blas/blas.h" #if defined(_WIN32) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 4f26ce0b27dbf..6b33af9ac10ba 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -66,7 +66,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/to_string.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/fluid/platform/flags.h b/paddle/fluid/platform/flags.h index 03986816c53f9..6db5e710b8dc8 100644 --- a/paddle/fluid/platform/flags.h +++ b/paddle/fluid/platform/flags.h @@ -21,7 +21,7 @@ #include "gflags/gflags.h" #include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/utils/variant.h" namespace paddle { diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h index e7612f6dcb6cd..a0c1129d2cb87 100644 --- a/paddle/fluid/platform/monitor.h +++ b/paddle/fluid/platform/monitor.h @@ -26,6 +26,8 @@ #include "glog/logging.h" +#include "paddle/phi/core/macros.h" + namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index d544cdecc3994..cde17007715a6 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -18,7 +18,7 @@ limitations under the License. */ // #include #include "paddle/fluid/platform/enforce.h" -// #include "paddle/fluid/platform/variant.h" +// #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/enforce_npu.h" #endif diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h deleted file mode 100644 index 9682749898fc7..0000000000000 --- a/paddle/fluid/platform/variant.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// Boost 1.41.0 requires __CUDACC_VER__, but in CUDA 9 __CUDACC_VER__ -// is removed, so we have to manually define __CUDACC_VER__ instead. -// For details, please refer to -// https://github.com/PaddlePaddle/Paddle/issues/6626 -#if defined(__CUDACC__) && defined(__CUDACC_VER_MAJOR__) -#undef __CUDACC_VER__ -#define __CUDACC_VER__ \ - __CUDACC_VER_BUILD__ + __CUDACC_VER_MAJOR__ * 10000 + \ - __CUDACC_VER_MINOR__ * 100 -#endif - -#include "boost/config.hpp" - -// Because Boost 1.41.0's variadic templates has bug on nvcc, boost -// will disable variadic template support in NVCC mode. Define -// BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same -// function symbols. 
For details, -// https://github.com/PaddlePaddle/Paddle/issues/3386 -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES -#define BOOST_NO_CXX11_VARIADIC_TEMPLATES -#endif -#endif - -#include -#include - -#include "paddle/utils/any.h" -#include "paddle/utils/optional.h" - -// some platform-independent defintion -#if defined(_WIN32) -#define UNUSED -#define __builtin_expect(EXP, C) (EXP) -#else -#define UNUSED __attribute__((unused)) -#endif diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index d5f84d7382105..e1950ade92fb2 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -35,7 +35,7 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/pybind/data_set_py.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index 05028a9b70efb..8626659d8633a 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/pybind/fleet_wrapper_py.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index b2a52e568aed9..c45566ba35673 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/macros.h" #include "pybind11/stl.h" // FIXME(zengjinle): these 2 flags may be removed by the linker when compiling diff --git a/paddle/fluid/pybind/metrics_py.cc b/paddle/fluid/pybind/metrics_py.cc index 50318cf9e6fc4..78e6d528b1af3 100644 --- a/paddle/fluid/pybind/metrics_py.cc +++ b/paddle/fluid/pybind/metrics_py.cc @@ -27,7 +27,7 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/pybind/metrics_py.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/nccl_wrapper_py.cc b/paddle/fluid/pybind/nccl_wrapper_py.cc index bbba03f6660fe..827bcaf39704d 100644 --- a/paddle/fluid/pybind/nccl_wrapper_py.cc +++ b/paddle/fluid/pybind/nccl_wrapper_py.cc @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/variant.h" + #include "paddle/fluid/pybind/nccl_wrapper_py.h" namespace py = pybind11; diff --git a/paddle/fluid/pybind/protobuf.h b/paddle/fluid/pybind/protobuf.h index 54b788cccba5b..79f174b5eb607 100644 --- a/paddle/fluid/pybind/protobuf.h +++ b/paddle/fluid/pybind/protobuf.h @@ -22,7 +22,6 @@ typedef SSIZE_T ssize_t; #include #include -#include "paddle/fluid/platform/variant.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/utils/tribool.h b/paddle/utils/tribool.h index 98a5019d71535..9ede76f3ec15e 100644 --- a/paddle/utils/tribool.h +++ b/paddle/utils/tribool.h @@ -18,6 +18,7 @@ // 2. remove the depending boost header files // 3. remove the dummy_ in indeterminate_t, which is specially implemented for // Borland C++ Builder +// 4. remove unnecessary macro BOOST_TRIBOOL_THIRD_STATE // Three-state boolean logic library @@ -437,27 +438,3 @@ namespace paddle { using logic::indeterminate; using logic::tribool; } // namespace paddle - -/** - * \brief Declare a new name for the third state of a tribool - * - * Use this macro to declare a new name for the third state of a - * tribool. This state can have any number of new names (in addition - * to \c indeterminate), all of which will be equivalent. The new name will be - * placed in the namespace in which the macro is expanded. - * - * Example: - * PADDLE_TRIBOOL_THIRD_STATE(true_or_false) - * - * tribool x(true_or_false); - * // potentially set x - * if (true_or_false(x)) { - * // don't know what x is - * } - */ -#define PADDLE_TRIBOOL_THIRD_STATE(Name) \ - inline bool Name(boost::logic::tribool x, \ - boost::logic::detail::indeterminate_t dummy = \ - boost::logic::detail::indeterminate_t()) { \ - return x.value == boost::logic::tribool::indeterminate_value; \ - } diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py index a94307161d431..11fdc9d0addfa 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py @@ -51,7 +51,6 @@ def build_extensions(self): compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], 'build/third_party') paddle_custom_kernel_include += [ - os.path.join(compile_third_party_path, 'boost/src/extern_boost'), # boost os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags os.path.join(compile_third_party_path, 'install/glog/include'), # glog ] diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index 94de1a39ccfbb..8147fc3d343d6 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -53,7 +53,6 @@ def build_extensions(self): compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], 'build/third_party') paddle_custom_kernel_include += [ - os.path.join(compile_third_party_path, 'boost/src/extern_boost'), # boost os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags os.path.join(compile_third_party_path, 'install/glog/include'), # glog ] diff --git a/tools/prune_for_jetson.py b/tools/prune_for_jetson.py index d53b21d6c3723..cbc03393360d2 100644 --- 
a/tools/prune_for_jetson.py +++ b/tools/prune_for_jetson.py @@ -125,7 +125,7 @@ def append_fluid_kernels(): with io.open(file_name, 'r', encoding='utf-8') as f: content = ''.join(f.readlines()) - location_str = "nv_library(\n tensorrt_op_teller\n SRCS op_teller.cc\n DEPS framework_proto device_context boost)" + location_str = "nv_library(\n tensorrt_op_teller\n SRCS op_teller.cc\n DEPS framework_proto device_context)" new_content = content.replace(location_str, location_str + append_str) if new_content == content: From b918d063275a7cbad245bfc20dc1d0cc4c723db8 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 5 Jul 2022 10:55:03 +0800 Subject: [PATCH 057/250] refine tensor.dtype print formate for bfloat16 (#44055) * refine tensor.dtype for bloat16 * refine test * revert * refine bfloat16 print --- python/paddle/fluid/dygraph/varbase_patch_methods.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 9eb044188f0d1..48497f4b9092f 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -1039,8 +1039,11 @@ def to_sparse_coo(self, sparse_dim): def dtype_str(dtype): if dtype in _PADDLE_DTYPE_2_NUMPY_DTYPE: + numpy_dtype = _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] + if numpy_dtype == 'uint16': + numpy_dtype = 'bfloat16' prefix = 'paddle.' - return prefix + _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] + return prefix + numpy_dtype else: # for example, paddle.fluid.core.VarDesc.VarType.LOD_TENSOR return origin(dtype) From 5fbc26e282928d0bff08551d00cd0dd92cd3db07 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:33:54 +0800 Subject: [PATCH 058/250] make Linux and windows both use external/cub 1.16.0 (#44004) * make only win32 and 11.6 use external/cub * unify cub version in linux and windows when cuda >= 11.6 --- cmake/external/cub.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index c6b435288e37e..de66e8d63d069 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -24,7 +24,7 @@ set(CUB_PREFIX_DIR ${CUB_PATH}) set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git) -if(WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6) +if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6) # cuda_11.6.2_511.65‘s own cub is 1.15.0, which will cause compiling error in windows. 
set(CUB_TAG 1.16.0) # cub 1.16.0 is not compitable with current thrust version From ca9339eb1abd9ccab3c22c8d8f00a4afb1bd572b Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:37:27 +0800 Subject: [PATCH 059/250] replace fluid.mean to paddle.mean (#43907) * change fluid.mean to paddle.mean * reverse some old code examples --- .../slim/tests/imperative_test_utils.py | 2 +- .../fluid/contrib/slim/tests/test_graph.py | 2 +- .../contrib/slim/tests/test_imperative_qat.py | 2 +- .../slim/tests/test_imperative_qat_amp.py | 4 +- .../tests/test_quantization_mkldnn_pass.py | 2 +- .../slim/tests/test_quantization_pass.py | 8 +- .../tests/test_quantization_scale_pass.py | 2 +- .../slim/tests/test_quantize_transpiler_v2.py | 2 +- .../tests/test_user_defined_quantization.py | 2 +- .../tests/test_image_classification_fp16.py | 4 +- .../contrib/tests/test_quantize_transpiler.py | 6 +- .../contrib/tests/test_weight_decay_extend.py | 2 +- python/paddle/fluid/dygraph/jit.py | 2 +- .../fluid/dygraph/learning_rate_scheduler.py | 2 +- .../incubate/fleet/tests/fleet_deep_ctr.py | 3 +- python/paddle/fluid/install_check.py | 2 +- python/paddle/fluid/io.py | 6 +- python/paddle/fluid/layers/nn.py | 2 +- python/paddle/fluid/optimizer.py | 4 +- .../tests/book/notest_understand_sentiment.py | 6 +- .../fluid/tests/book/test_fit_a_line.py | 6 +- .../tests/book/test_image_classification.py | 2 +- .../tests/book/test_label_semantic_roles.py | 2 +- .../fluid/tests/book/test_recognize_digits.py | 2 +- .../tests/book/test_recommender_system.py | 2 +- .../tests/book/test_rnn_encoder_decoder.py | 2 +- .../fluid/tests/book/test_word2vec_book.py | 2 +- .../fluid/tests/test_beam_search_decoder.py | 2 +- python/paddle/fluid/tests/test_error_clip.py | 2 +- python/paddle/fluid/tests/test_if_else_op.py | 4 +- .../tests/unittests/asp/asp_pruning_base.py | 2 +- .../asp/test_asp_customized_pruning.py | 2 +- .../unittests/asp/test_asp_optimize_static.py | 2 +- .../unittests/asp/test_asp_pruning_static.py | 2 +- .../tests/unittests/asp/test_asp_save_load.py | 2 +- .../unittests/auto_parallel_parallelizer.py | 2 +- .../tests/unittests/check_nan_inf_base.py | 2 +- .../tests/unittests/dist_allreduce_op.py | 2 +- .../paddle/fluid/tests/unittests/dist_ctr.py | 2 +- .../fluid/tests/unittests/dist_fleet_ctr.py | 2 +- .../dist_fleet_heter_pipeline_ctr.py | 2 +- .../dist_fleet_raw_program_optimizer.py | 2 +- ...et_raw_program_optimizer_fuse_allreduce.py | 2 +- .../tests/unittests/dist_fleet_simnet_bow.py | 2 +- .../dist_fleet_sparse_embedding_ctr.py | 2 +- .../fluid/tests/unittests/dist_mnist.py | 2 +- .../tests/unittests/dist_mnist_batch_merge.py | 2 +- .../unittests/dist_mnist_fp16_allreduce.py | 2 +- .../unittests/dist_mnist_gradient_merge.py | 2 +- .../fluid/tests/unittests/dist_mnist_lars.py | 2 +- .../fluid/tests/unittests/dist_se_resnext.py | 2 +- .../tests/unittests/dist_sharding_save.py | 2 +- .../unittests/dist_text_classification.py | 2 +- .../fluid/tests/unittests/dist_word2vec.py | 2 +- .../dygraph_to_static/bert_dygraph_model.py | 6 +- .../dygraph_to_static/ifelse_simple_func.py | 26 +- .../dygraph_to_static/test_ast_util.py | 3 +- .../unittests/dygraph_to_static/test_bmn.py | 4 +- .../dygraph_to_static/test_cache_program.py | 4 +- .../dygraph_to_static/test_convert_call.py | 4 +- .../unittests/dygraph_to_static/test_error.py | 6 +- .../dygraph_to_static/test_fetch_feed.py | 4 +- .../dygraph_to_static/test_full_name_usage.py | 5 +- .../dygraph_to_static/test_ifelse.py | 4 +- 
.../unittests/dygraph_to_static/test_lac.py | 2 +- .../dygraph_to_static/test_lambda.py | 11 +- .../unittests/dygraph_to_static/test_mnist.py | 2 +- .../dygraph_to_static/test_mobile_net.py | 2 +- .../dygraph_to_static/test_partial_program.py | 2 +- .../test_program_translator.py | 8 +- .../dygraph_to_static/test_resnet.py | 2 +- .../dygraph_to_static/test_resnet_amp.py | 2 +- .../test_resnet_pure_fp16.py | 2 +- .../test_save_inference_model.py | 2 +- .../dygraph_to_static/test_se_resnet.py | 2 +- .../dygraph_to_static/test_sentiment.py | 8 +- .../unittests/dygraph_to_static/test_tsm.py | 2 +- .../unittests/fleet_heter_ps_training.py | 2 +- .../unittests/fleet_meta_optimizer_base.py | 4 +- .../tests/unittests/ipu/test_mean_op_ipu.py | 2 +- .../test_trt_conv_quant_dequant_pass.py | 7 +- .../test_trt_fc_fuse_quant_dequant_pass.py | 7 +- .../test_trt_matmul_quant_dequant.py | 7 +- .../fluid/tests/unittests/ir/pass_test.py | 4 +- .../ir/test_ir_subgraph_python_interface.py | 2 +- .../unittests/mlu/test_momentum_op_mlu.py | 6 +- .../tests/unittests/mlu/test_where_op_mlu.py | 2 +- .../unittests/npu/test_momentum_op_npu.py | 4 +- .../unittests/npu/test_softmax_op_npu.py | 2 +- .../tests/unittests/npu/test_where_op_npu.py | 2 +- .../tests/unittests/npu/test_while_op_npu.py | 2 +- .../tests/unittests/parallel_dygraph_mnist.py | 2 +- .../unittests/parallel_dygraph_se_resnext.py | 2 +- .../parallel_dygraph_sync_batch_norm.py | 2 +- .../fluid/tests/unittests/pipeline_mnist.py | 2 +- .../unittests/pipeline_mnist_multi_device.py | 2 +- .../unittests/pipeline_mnist_one_device.py | 2 +- .../fluid/tests/unittests/seresnext_net.py | 3 +- .../fluid/tests/unittests/simple_nets.py | 7 +- .../fluid/tests/unittests/test_adadelta_op.py | 2 +- .../fluid/tests/unittests/test_adam_op.py | 4 +- .../test_adam_optimizer_fp32_fp64.py | 2 +- .../fluid/tests/unittests/test_adamw_op.py | 2 +- .../unittests/test_array_read_write_op.py | 12 +- .../fluid/tests/unittests/test_assign_op.py | 4 +- .../test_async_ssa_graph_executor_mnist.py | 2 +- .../fluid/tests/unittests/test_backward.py | 6 +- .../tests/unittests/test_calc_gradient.py | 2 +- .../paddle/fluid/tests/unittests/test_case.py | 5 +- .../unittests/test_communicator_async.py | 2 +- .../tests/unittests/test_communicator_geo.py | 2 +- .../unittests/test_communicator_half_async.py | 2 +- .../unittests/test_communicator_ps_gpu.py | 2 +- .../tests/unittests/test_communicator_sync.py | 2 +- .../tests/unittests/test_compiled_program.py | 2 +- .../paddle/fluid/tests/unittests/test_cond.py | 8 +- .../tests/unittests/test_conditional_block.py | 3 +- .../fluid/tests/unittests/test_dataset.py | 6 +- .../unittests/test_decoupled_py_reader.py | 2 +- .../fluid/tests/unittests/test_desc_clone.py | 8 +- .../test_dist_fleet_a_sync_optimizer_async.py | 4 +- .../test_dist_fleet_a_sync_optimizer_auto.py | 2 +- ..._dist_fleet_a_sync_optimizer_auto_async.py | 2 +- ...st_dist_fleet_a_sync_optimizer_auto_geo.py | 2 +- .../test_dist_fleet_a_sync_optimizer_geo.py | 4 +- .../test_dist_fleet_a_sync_optimizer_sync.py | 2 +- .../tests/unittests/test_dist_fleet_ps.py | 2 +- .../tests/unittests/test_dist_fleet_ps11.py | 2 +- .../tests/unittests/test_dist_fleet_ps12.py | 2 +- .../tests/unittests/test_dist_fleet_ps2.py | 2 +- .../tests/unittests/test_dist_fleet_ps3.py | 2 +- .../tests/unittests/test_dist_fleet_ps4.py | 2 +- .../tests/unittests/test_dist_fleet_ps5.py | 2 +- .../tests/unittests/test_dist_fleet_ps6.py | 2 +- .../test_dist_fleet_trainer_desc_config.py | 2 +- 
.../unittests/test_dist_mnist_fleetapi.py | 2 +- .../tests/unittests/test_dist_transpiler.py | 27 +- .../unittests/test_distributed_strategy.py | 3 +- .../fluid/tests/unittests/test_downpoursgd.py | 6 +- .../unittests/test_dygraph_mnist_fp16.py | 3 +- .../unittests/test_dygraph_multi_forward.py | 4 +- .../fluid/tests/unittests/test_dyn_rnn.py | 8 +- .../unittests/test_dynrnn_gradient_check.py | 5 +- .../unittests/test_dynrnn_static_input.py | 2 +- .../test_eager_deletion_delete_vars.py | 2 +- .../unittests/test_eager_deletion_gru_net.py | 3 +- .../unittests/test_eager_deletion_lstm_net.py | 3 +- .../test_eager_deletion_recurrent_op.py | 14 +- .../unittests/test_eager_deletion_while_op.py | 2 +- .../paddle/fluid/tests/unittests/test_ema.py | 3 +- .../test_embedding_id_stop_gradient.py | 2 +- .../fluid/tests/unittests/test_exception.py | 2 +- .../unittests/test_executor_check_feed.py | 3 +- .../test_executor_feed_non_tensor.py | 3 +- .../test_feed_data_check_shape_type.py | 2 +- .../tests/unittests/test_fetch_unmerged.py | 2 +- .../tests/unittests/test_fleet_api_input.py | 5 +- .../fluid/tests/unittests/test_fleet_auto.py | 2 +- .../tests/unittests/test_fleet_base_2.py | 2 +- .../tests/unittests/test_fleet_base_3.py | 4 +- .../tests/unittests/test_fleet_checkpoint.py | 3 +- ...est_fleet_fp16_allreduce_meta_optimizer.py | 2 +- ...st_fleet_graph_execution_meta_optimizer.py | 8 +- .../unittests/test_fleet_graph_executor.py | 2 +- .../test_fleet_hybrid_meta_optimizer.py | 50 ++-- .../test_fleet_lamb_meta_optimizer.py | 4 +- .../test_fleet_lars_meta_optimizer.py | 4 +- .../test_fleet_meta_optimizer_base.py | 2 +- .../test_fleet_pipeline_meta_optimizer.py | 2 +- ..._pipeline_meta_optimizer_with_recompute.py | 2 +- .../test_fleet_raw_program_meta_optimizer.py | 2 +- .../unittests/test_fleet_rolemaker_new.py | 2 +- .../test_fleet_sharding_meta_optimizer.py | 282 +++++++++--------- .../tests/unittests/test_fuse_bn_act_pass.py | 2 +- .../unittests/test_fuse_bn_add_act_pass.py | 4 +- .../test_fuse_relu_depthwise_conv_pass.py | 2 +- .../unittests/test_generator_dataloader.py | 2 +- .../tests/unittests/test_gradient_clip.py | 4 +- .../test_imperative_auto_mixed_precision.py | 10 +- ...perative_auto_mixed_precision_for_eager.py | 10 +- .../unittests/test_imperative_auto_prune.py | 9 +- .../tests/unittests/test_imperative_mnist.py | 4 +- .../test_imperative_mnist_sorted_gradient.py | 4 +- .../tests/unittests/test_imperative_resnet.py | 4 +- .../test_imperative_resnet_sorted_gradient.py | 4 +- .../unittests/test_imperative_se_resnext.py | 4 +- ...perative_star_gan_with_gradient_penalty.py | 6 +- .../test_imperative_static_runner_mnist.py | 4 +- .../test_imperative_static_runner_while.py | 6 +- .../unittests/test_inference_model_io.py | 14 +- .../tests/unittests/test_ir_inplace_pass.py | 3 +- .../test_ir_memory_optimize_ifelse_op.py | 2 +- .../unittests/test_ir_memory_optimize_nlp.py | 4 +- .../unittests/test_ir_memory_optimize_pass.py | 4 +- .../tests/unittests/test_jit_save_load.py | 16 +- .../fluid/tests/unittests/test_lambv2_op.py | 2 +- .../fluid/tests/unittests/test_layers.py | 10 +- .../unittests/test_listen_and_serv_op.py | 4 +- .../test_load_state_dict_from_old_format.py | 2 +- .../unittests/test_lod_tensor_array_ops.py | 3 +- .../fluid/tests/unittests/test_lookahead.py | 2 +- .../unittests/test_lookup_table_v2_op.py | 2 +- .../fluid/tests/unittests/test_mean_op.py | 8 +- .../tests/unittests/test_memory_usage.py | 2 +- .../test_mix_precision_all_reduce_fuse.py | 2 +- 
.../tests/unittests/test_modelaverage.py | 2 +- .../fluid/tests/unittests/test_momentum_op.py | 6 +- .../paddle/fluid/tests/unittests/test_nce.py | 4 +- .../unittests/test_network_with_dtype.py | 2 +- .../tests/unittests/test_nn_sigmoid_op.py | 1 + .../tests/unittests/test_optimizer_grad.py | 2 +- .../test_optimizer_in_control_flow.py | 14 +- .../unittests/test_parallel_executor_crf.py | 2 +- .../test_parallel_executor_drop_scope.py | 3 +- .../test_parallel_executor_dry_run.py | 3 +- .../test_parallel_executor_fetch_feed.py | 3 +- .../unittests/test_parallel_executor_mnist.py | 4 +- .../fluid/tests/unittests/test_profiler.py | 2 +- .../fluid/tests/unittests/test_program.py | 7 +- .../unittests/test_program_prune_backward.py | 12 +- .../tests/unittests/test_prroi_pool_op.py | 5 +- .../fluid/tests/unittests/test_prune.py | 53 ++-- .../unittests/test_pull_gpups_sparse_op.py | 2 +- .../fluid/tests/unittests/test_py_func_op.py | 2 +- .../test_py_reader_using_executor.py | 2 +- .../tests/unittests/test_recurrent_op.py | 13 +- .../fluid/tests/unittests/test_registry.py | 3 +- .../fluid/tests/unittests/test_regularizer.py | 2 +- .../tests/unittests/test_regularizer_api.py | 2 +- .../fluid/tests/unittests/test_rmsprop_op.py | 2 +- .../unittests/test_select_input_output_op.py | 3 +- .../fluid/tests/unittests/test_sgd_op.py | 6 +- .../tests/unittests/test_shrink_rnn_memory.py | 3 +- .../test_split_and_merge_lod_tensor_op.py | 3 +- .../fluid/tests/unittests/test_trainable.py | 3 +- .../tests/unittests/test_weight_decay.py | 2 +- .../fluid/tests/unittests/test_where_op.py | 2 +- .../tests/unittests/test_while_loop_op.py | 6 +- .../fluid/tests/unittests/test_while_op.py | 2 +- .../tests/unittests/xpu/test_assign_op_xpu.py | 2 +- .../tests/unittests/xpu/test_mean_op_xpu.py | 4 +- .../unittests/xpu/test_rmsprop_op_xpu.py | 2 +- .../tests/unittests/xpu/test_sgd_op_xpu.py | 2 +- .../tests/unittests/xpu/test_where_op_xpu.py | 2 +- 244 files changed, 662 insertions(+), 612 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py index 36302aea187af..d69241d6cb982 100644 --- a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py +++ b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py @@ -68,7 +68,7 @@ def train_lenet(lenet, reader, optimizer): out = lenet(img) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() optimizer.minimize(avg_loss) diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py index d8887e1964128..1102ddb0074a2 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_graph.py +++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py @@ -46,7 +46,7 @@ def conv_block(): act="relu") prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return [img, label], avg_loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 0bb246f9ac923..2c18eff983e4c 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -118,7 +118,7 @@ def func_qat(self): out = lenet(img) acc = fluid.layers.accuracy(out, label) loss = 
fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() adam.minimize(avg_loss) lenet.clear_gradients() diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py index e40816f39545a..6a3e35007dd46 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py @@ -115,7 +115,7 @@ def model_train(self, model, batch_num=-1, batch_size=32, use_amp=False): out = model(img) acc = fluid.layers.accuracy(out, label) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) scaled_loss = scaler.scale(avg_loss) scaled_loss.backward() @@ -125,7 +125,7 @@ def model_train(self, model, batch_num=-1, batch_size=32, use_amp=False): out = model(img) acc = fluid.layers.accuracy(out, label) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() adam.minimize(avg_loss) diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py index 28706d34c63fd..fbb1adefa1111 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py @@ -45,7 +45,7 @@ def conv_net(img, label): act="relu") prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index c42777d673a7d..ce06bd63a8628 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -41,7 +41,7 @@ def linear_fc(num): for _ in six.moves.xrange(num): hidden = fluid.layers.fc(hidden, size=128, act='relu') loss = fluid.layers.cross_entropy(input=hidden, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -92,7 +92,7 @@ def conv_bn_layer(input, pool_stride=2) fc = fluid.layers.fc(input=pool, size=10) loss = fluid.layers.cross_entropy(input=fc, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -116,7 +116,7 @@ def conv_net(img, label, quant_skip_pattern): with fluid.name_scope(quant_skip_pattern): prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss @@ -620,7 +620,7 @@ def conv_bn_layer(input, pool_add = fluid.layers.elementwise_add(x=pool1, y=pool2, act='relu') fc = fluid.layers.fc(input=pool_add, size=10) loss = fluid.layers.cross_entropy(input=fc, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py index 0f4c450cfa98d..2e78d4ea8cba3 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py +++ 
b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py @@ -53,7 +53,7 @@ def conv_net(img, label): hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py index 80fe720504efd..d2a5383024338 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantize_transpiler_v2.py @@ -48,7 +48,7 @@ def conv_net(img, label): hidden = fluid.layers.fc(input=conv_pool_1, size=100, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py index ec9ab8820a613..9ee4f3681588d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py +++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py @@ -55,7 +55,7 @@ def conv_net(img, label): hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py index 60dfde6b45c37..35787c02eef3e 100644 --- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py +++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py @@ -136,7 +136,7 @@ def train(net_type, use_cuda, save_dirname, is_local): logits = fluid.layers.fc(input=net, size=classdim, act="softmax") cost, predict = fluid.layers.softmax_with_cross_entropy( logits, label, return_softmax=True) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) acc = fluid.layers.accuracy(input=predict, label=label) # Test program @@ -460,7 +460,7 @@ def decorate_with_data_loader(self): logits = fluid.layers.fc(input=net, size=10, act="softmax") cost, predict = fluid.layers.softmax_with_cross_entropy( logits, label, return_softmax=True) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) optimizer = fluid.optimizer.Lamb(learning_rate=0.001) amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists( diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py index dd900ff428135..a7472f3bce5ac 100644 --- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py +++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py @@ -32,7 +32,7 @@ def linear_fc(num): for _ in six.moves.xrange(num): hidden = fluid.layers.fc(hidden, size=128, act='relu') loss = fluid.layers.cross_entropy(input=hidden, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -63,7 +63,7 @@ def conv_bn_layer(input, hidden = fluid.layers.elementwise_add(x=conv, y=short, 
act='relu') fc = fluid.layers.fc(input=hidden, size=10) loss = fluid.layers.cross_entropy(input=fc, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -83,7 +83,7 @@ def conv_net(img, label): act="relu") prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py index bbc61d34613da..4bb1ed72b7b9f 100644 --- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py +++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py @@ -87,7 +87,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index e8c263fe03355..a55bcb9aaaba6 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -103,7 +103,7 @@ def _dygraph_to_static_func_(dygraph_func): @dygraph_to_static_func def func(x): - if fluid.layers.mean(x) < 0: + if paddle.mean(x) < 0: x_v = x - 1 else: x_v = x + 1 diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 4b9c50127f046..18950144bc4d4 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -897,7 +897,7 @@ def step(self, loss): check_type(loss, 'loss', Variable, 'ReduceLROnPlateau.step') assert len(loss.shape) == 1 and loss.shape[0] == 1, "the loss.shape " \ "should be (1L,), but the current loss.shape is {}. 
Maybe that " \ - "you should call fluid.layers.mean to process it first.".format(loss.shape) + "you should call paddle.mean to process it first.".format(loss.shape) self.epoch_num += 1 if self.cooldown_counter > 0: diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py index 806de1e6da900..1b763c6ed5952 100644 --- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py +++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py @@ -16,6 +16,7 @@ import logging import time +import paddle import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet @@ -123,7 +124,7 @@ def model(): auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return datas, avg_cost, predict, train_file_path diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 0c621766b3794..98d7fa6a037a6 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -102,7 +102,7 @@ def test_parallerl_exe(): exe = executor.Executor( core.CUDAPlace(0) if core.is_compiled_with_cuda() and (core.get_cuda_device_count() > 0) else core.CPUPlace()) - loss = layers.mean(out) + loss = paddle.mean(out) loss.persistable = True optimizer.SGD(learning_rate=0.01).minimize(loss) startup_prog.random_seed = 1 diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 7a39c2bc2fbc0..db88331040fa7 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -493,7 +493,7 @@ def save_params(executor, dirname, main_program=None, filename=None): predict = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=predict, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) @@ -719,7 +719,7 @@ def save_persistables(executor, dirname, main_program=None, filename=None): predict = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=predict, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) fluid.io.save_persistables(executor=exe, dirname=dir_path, filename=file_name) @@ -1315,7 +1315,7 @@ def save_inference_model(dirname, predict = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=predict, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d7f0feb103c5f..050d6bfcb6bbb 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13089,7 +13089,7 @@ def mean(x, name=None): input = fluid.layers.data( name='data', shape=[2, 3], dtype='float32') - mean = fluid.layers.mean(input) + mean = paddle.mean(input) """ if _in_legacy_dygraph(): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 315382262a0f3..c97809a069d5c 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -913,7 +913,7 @@ 
def backward(self, program = loss.block.program assert len(loss.shape) == 1 and loss.shape[0] == 1, \ "The loss.shape should be (1L,), but the current loss.shape is {}. " \ - "Maybe that you should call fluid.layers.mean to process the current loss.".format( + "Maybe that you should call paddle.mean to process the current loss.".format( loss.shape) parameter_list = parameter_list if parameter_list \ else self._parameter_list @@ -6834,7 +6834,7 @@ class LookaheadOptimizer(object): label = fluid.layers.data(name="label", shape=[1], dtype="int64") y = fluid.layers.fc(input=[x], size=2, act="softmax") loss = fluid.layers.cross_entropy(input=y, label=label) - loss = fluid.layers.mean(x=loss) + loss = paddle.mean(x=loss) sgd = fluid.optimizer.SGD(learning_rate=0.01) optimizer = fluid.optimizer.LookaheadOptimizer(sgd, alpha=0.5, diff --git a/python/paddle/fluid/tests/book/notest_understand_sentiment.py b/python/paddle/fluid/tests/book/notest_understand_sentiment.py index d96e640f77a96..941ff43ab7d69 100644 --- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py @@ -48,7 +48,7 @@ def convolution_net(data, size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, accuracy, prediction @@ -93,7 +93,7 @@ def gate_common(ipt, hidden, size): last = fluid.layers.sequence_last_step(rnn()) prediction = fluid.layers.fc(input=last, size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, accuracy, prediction @@ -132,7 +132,7 @@ def stacked_lstm_net(data, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) accuracy = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, accuracy, prediction diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 71ba7f0c79ec9..62aaefedde780 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -56,16 +56,16 @@ def train(use_cuda, save_dirname, is_local, use_bf16, pure_bf16): with amp.bf16.bf16_guard(): y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) else: y_predict = fluid.layers.fc(input=x, size=1, act=None) with amp.bf16.bf16_guard(): cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) else: y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) lr = 5e-3 if use_bf16 else 1e-3 sgd_optimizer = fluid.optimizer.SGD(learning_rate=lr) diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index e2f78a0f36f7b..0b31a62e8e8f7 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ 
b/python/paddle/fluid/tests/book/test_image_classification.py @@ -126,7 +126,7 @@ def train(net_type, use_cuda, save_dirname, is_local): predict = fluid.layers.fc(input=net, size=classdim, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) acc = fluid.layers.accuracy(input=predict, label=label) # Test program diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index cb962493e7ac8..c8ffe0cb49cb6 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -160,7 +160,7 @@ def train(use_cuda, save_dirname=None, is_local=True): param_attr=fluid.ParamAttr( name='crfw', learning_rate=mix_hidden_lr)) - avg_cost = fluid.layers.mean(crf_cost) + avg_cost = paddle.mean(crf_cost) # TODO(qiao) # check other optimizers and check why out will be NAN diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 5301f9aa7607c..e81061f665477 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -34,7 +34,7 @@ def loss_net(hidden, label): prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) acc = fluid.layers.accuracy(input=prediction, label=label) return prediction, avg_loss, acc diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 0a26a03eb878b..048bfac344e79 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -153,7 +153,7 @@ def model(): label = layers.data(name='score', shape=[1], dtype='float32') square_cost = layers.square_error_cost(input=scale_infer, label=label) - avg_cost = layers.mean(square_cost) + avg_cost = paddle.mean(square_cost) return scale_infer, avg_cost diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py index 9499583c07bae..694ed70c04dea 100644 --- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py +++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py @@ -158,7 +158,7 @@ def seq_to_seq_net(): dtype='int64', lod_level=1) cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost, prediction diff --git a/python/paddle/fluid/tests/book/test_word2vec_book.py b/python/paddle/fluid/tests/book/test_word2vec_book.py index 9e79fd3f523f8..b1325abea01b6 100644 --- a/python/paddle/fluid/tests/book/test_word2vec_book.py +++ b/python/paddle/fluid/tests/book/test_word2vec_book.py @@ -85,7 +85,7 @@ def __network__(words): size=dict_size, act='softmax') cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost, predict_word word_dict = paddle.dataset.imikolov.build_dict() diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py index f37090f67e257..887180d3f01fd 100644 --- a/python/paddle/fluid/tests/test_beam_search_decoder.py +++ 
b/python/paddle/fluid/tests/test_beam_search_decoder.py @@ -145,7 +145,7 @@ def train_main(use_cuda): dtype='int64', lod_level=1) cost = layers.cross_entropy(input=rnn_out, label=label) - avg_cost = layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index e3b20c323929a..68380f8187083 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -35,7 +35,7 @@ label = fluid.layers.data(name='y', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) prog_clip = prog.clone() prog_clip.block(0).var(hidden1.name)._set_error_clip( diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py index 12d33d1c724df..4e140032dd8dc 100644 --- a/python/paddle/fluid/tests/test_if_else_op.py +++ b/python/paddle/fluid/tests/test_if_else_op.py @@ -66,7 +66,7 @@ def not_test_raw_api(self): mask=cond, x=image) loss = layers.cross_entropy(input=prob, label=label) - avg_loss = layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) optimizer.minimize(avg_loss, startup_prog) @@ -124,7 +124,7 @@ def not_test_ifelse(self): prob = ie() loss = layers.cross_entropy(input=prob[0], label=label) - avg_loss = layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) optimizer.minimize(avg_loss, startup_prog) diff --git a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py index 1b387c081208d..0e24e94d456b0 100644 --- a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py +++ b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py @@ -61,7 +61,7 @@ def run_inference_pruning_test(self, get_mask_gen_func, def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): with fluid.program_guard(self.main_program, self.startup_program): - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=self.predict, label=self.label)) optimizer = paddle.incubate.asp.decorate( diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py index 27b4361852f6a..e35430b046ac5 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_customized_pruning.py @@ -242,7 +242,7 @@ def test_inference_pruning(self): def test_training_pruning(self): with fluid.program_guard(self.main_program, self.startup_program): - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=self.predict, label=self.label)) optimizer = sparsity.decorate( diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py index c6b3d00cac249..8770d4cb3b575 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize_static.py @@ -48,7 +48,7 @@ def build_model(): with fluid.program_guard(self.main_program, self.startup_program): self.img, self.label, predict = build_model() - self.loss = 
fluid.layers.mean( + self.loss = paddle.mean( fluid.layers.cross_entropy(input=predict, label=self.label)) self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py index 1bb5c1477b29b..4796bf364a207 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_static.py @@ -66,7 +66,7 @@ def test_inference_pruning(self): def test_training_pruning(self): with fluid.program_guard(self.main_program, self.startup_program): - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=self.predict, label=self.label)) optimizer = paddle.incubate.asp.decorate( diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py index dc5316d254fd5..66543514e5369 100644 --- a/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_save_load.py @@ -141,7 +141,7 @@ def build_model(): with fluid.program_guard(self.main_program, self.startup_program): self.img, self.label, predict = build_model() - self.loss = fluid.layers.mean( + self.loss = paddle.mean( fluid.layers.cross_entropy(input=predict, label=self.label)) self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) self.optimizer = paddle.incubate.asp.decorate(self.optimizer) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py index 7d738d3678926..688a31b78de00 100755 --- a/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_parallelizer.py @@ -96,7 +96,7 @@ def mlp_pretrain_forward(train_program, start_program): predict = mlp(input) cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost, train_program, start_program diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py index d188ae6654509..69cd4a1b55411 100644 --- a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py @@ -70,7 +70,7 @@ def net(): cost, y_predict = fluid.layers.softmax_with_cross_entropy( hidden, y, return_softmax=True) acc_top1 = fluid.layers.accuracy(input=y_predict, label=y, k=1) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.05) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py index 1360d975603b2..9f7f411be5b21 100644 --- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py +++ b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py @@ -84,7 +84,7 @@ def get_model(self, batch_size=2, single_device=False): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py index 6cd452ed1952a..0e811cb050bcc 100644 --- 
a/python/paddle/fluid/tests/unittests/dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -92,7 +92,7 @@ def get_model(self, batch_size=2): auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) inference_program = paddle.fluid.default_main_program().clone() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 9508dc6c26292..a33624ee5eedf 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -143,7 +143,7 @@ def net(self, args, is_train=True, batch_size=4, lr=0.01): label=label) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) self.feeds = datas self.train_file_path = ["fake1", "fake2"] diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py index f714526286c92..dc0feb35ae8d1 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_pipeline_ctr.py @@ -116,7 +116,7 @@ def net(self, args, batch_size=4, lr=0.01): predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) fluid.layers.Print(avg_cost, message="avg_cost") self.feeds = datas diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py index 19e278b4f4620..50a2089cd177e 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py @@ -76,7 +76,7 @@ def get_model(self, batch_size=2, single_device=False): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py index cab4484d3e49c..003de5458786e 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer_fuse_allreduce.py @@ -76,7 +76,7 @@ def get_model(self, batch_size=2, single_device=False): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index 4a43fb44f46f7..66647b52bb500 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -91,7 +91,7 @@ def get_loss(cos_q_pt, cos_q_nt): shape=[-1, 1], value=0.0, dtype='float32'), loss_op2) - avg_cost = 
fluid.layers.mean(loss_op3) + avg_cost = paddle.mean(loss_op3) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py index 60b8a7bb6fdff..17589f7f93bb1 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_sparse_embedding_ctr.py @@ -133,7 +133,7 @@ def net(self, args, batch_size=4, lr=0.01): acc = fluid.layers.accuracy(input=predict, label=label) auc_var, _, _ = fluid.layers.auc(input=predict, label=label) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) self.feeds = datas self.train_file_path = ["fake1", "fake2"] diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index cdfec08f9fe7a..0d1e826c1f559 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -85,7 +85,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py index ca59e33ec9e12..c3d7a4f9a56f4 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py @@ -58,7 +58,7 @@ def get_model(self, batch_size=2): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py index b78dd744a9ae1..034bcbdb9a04a 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist_fp16_allreduce.py @@ -38,7 +38,7 @@ def get_model(self, batch_size=2): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py b/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py index 50a053f57b801..75d9bd806c921 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist_gradient_merge.py @@ -37,7 +37,7 @@ def get_model(self, batch_size=2): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py index 31362565c8981..9de09d0ff6ce4 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist_lars.py +++ 
b/python/paddle/fluid/tests/unittests/dist_mnist_lars.py @@ -49,7 +49,7 @@ def get_model(self, batch_size=2): # Train program predict = cnn_model(images) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) # Evaluator batch_size_tensor = fluid.layers.create_tensor(dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index ad5d632637ebb..eb4b41aff91ec 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -229,7 +229,7 @@ def get_model(self, batch_size=2, use_dgc=False): out = model.net(input=image, class_dim=102) cost = fluid.layers.cross_entropy(input=out, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) diff --git a/python/paddle/fluid/tests/unittests/dist_sharding_save.py b/python/paddle/fluid/tests/unittests/dist_sharding_save.py index e31901c8c85b9..e989374e2af46 100755 --- a/python/paddle/fluid/tests/unittests/dist_sharding_save.py +++ b/python/paddle/fluid/tests/unittests/dist_sharding_save.py @@ -56,7 +56,7 @@ def runtime_main(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.sharding = True diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py index ede62e643d2e6..08a96575617cf 100644 --- a/python/paddle/fluid/tests/unittests/dist_text_classification.py +++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py @@ -137,7 +137,7 @@ def get_model(self, batch_size=2): # Train program predict = conv_net(data, dict_dim) cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) acc = fluid.layers.accuracy(input=predict, label=label) inference_program = fluid.default_main_program().clone() diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index 744a6d6729a71..06bd017612204 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -94,7 +94,7 @@ def __network__(words): initializer=fluid.initializer.Constant(value=0.1))) cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost, predict_word word_dict = paddle.dataset.imikolov.build_dict() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index 7ee6203fb9433..68b3962dde9a6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -13,7 +13,7 @@ # limitations under the License. 
from __future__ import absolute_import, division, print_function - +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Layer, Linear from paddle.fluid.dygraph.jit import declarative @@ -357,7 +357,7 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask, mask_lm_loss = fluid.layers.softmax_with_cross_entropy(logits=fc_out, label=mask_label) - mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) + mean_mask_lm_loss = paddle.mean(mask_lm_loss) next_sent_fc_out = self.next_sent_fc(next_sent_feat) @@ -367,7 +367,7 @@ def forward(self, src_ids, position_ids, sentence_ids, input_mask, next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) - mean_next_sent_loss = fluid.layers.mean(next_sent_loss) + mean_next_sent_loss = paddle.mean(next_sent_loss) loss = mean_next_sent_loss + mean_mask_lm_loss return next_sent_acc, mean_mask_lm_loss, loss diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index 34264cac8a1be..d0d024fb78624 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -41,7 +41,7 @@ def dyfunc_empty_nonlocal(x): def dyfunc_with_if_else(x_v, label=None): - if fluid.layers.mean(x_v).numpy()[0] > 5: + if paddle.mean(x_v).numpy()[0] > 5: x_v = x_v - 1 else: x_v = x_v + 1 @@ -87,14 +87,14 @@ def false_fn_0(q, x, y): m = x + 2 n = x + 3 return q, x, y, z - q, x, y, z = fluid.layers.cond(fluid.layers.mean(x)[0] < 5, lambda : + q, x, y, z = fluid.layers.cond(paddle.mean(x)[0] < 5, lambda : paddle.jit.dy2static.convert_call(true_fn_0)(q, x, y), lambda : paddle.jit.dy2static.convert_call(false_fn_0)(q, x, y)) """ y = x + 1 # NOTE: x_v[0] < 5 is True - if fluid.layers.mean(x).numpy()[0] < 5: + if paddle.mean(x).numpy()[0] < 5: x = x + 1 z = x + 2 q = x + 3 @@ -155,13 +155,13 @@ def nested_if_else(x_v): batch_size = fluid.layers.shape(x_v)[0] # if tensor.shape is [1], now support to compare with numpy. 
- if fluid.layers.mean(x_v).numpy() < 0: + if paddle.mean(x_v).numpy() < 0: y = x_v + bias w = fluid.layers.fill_constant([feat_size], dtype='float32', value=10) if y.numpy()[0] < 10: tmp = y * w y = fluid.layers.relu(tmp) - if fluid.layers.mean(y).numpy()[0] < batch_size: + if paddle.mean(y).numpy()[0] < batch_size: y = fluid.layers.abs(y) else: tmp = fluid.layers.fill_constant([feat_size], @@ -257,7 +257,7 @@ def forward(self, input): value=1) # Control flow `if` statement fc_out = self.fc(input) - if fluid.layers.mean(fc_out).numpy()[0] < 0: + if paddle.mean(fc_out).numpy()[0] < 0: y = fc_out + self.constant_vars['bias'] self.constant_vars['w'] = fluid.layers.fill_constant( [5], dtype='float32', value=10) @@ -280,13 +280,13 @@ def forward(self, input): else: y = fc_out - self.constant_vars['bias'] - loss = fluid.layers.mean(y) + loss = paddle.mean(y) return loss def if_with_and_or(x_v, label=None): batch_size = fluid.layers.shape(x_v) - if x_v is not None and (fluid.layers.mean(x_v).numpy()[0] > 0 or label + if x_v is not None and (paddle.mean(x_v).numpy()[0] > 0 or label is not None) and batch_size[0] > 1 and True: x_v = x_v - 1 else: @@ -318,7 +318,7 @@ def if_with_and_or_2(x, y=None): def if_with_and_or_3(x, y=None): batch_size = fluid.layers.shape(x) - mean_res = fluid.layers.mean(x) + mean_res = paddle.mean(x) if x is not None and batch_size[0] > 1 and y is not None and mean_res.numpy( )[0] > 0: x = x + 1 @@ -329,7 +329,7 @@ def if_with_and_or_3(x, y=None): def if_with_and_or_4(x, y=None): batch_size = fluid.layers.shape(x) - mean_res = fluid.layers.mean(x) + mean_res = paddle.mean(x) if (x is not None and batch_size[0] > 1) or (y is not None and mean_res.numpy()[0] > 0): x = x + 1 @@ -349,7 +349,7 @@ def __init__(self): foo = Foo() batch_size = fluid.layers.shape(x) - mean_res = fluid.layers.mean(x) + mean_res = paddle.mean(x) if batch_size[0] > foo.a: x = x + foo.b @@ -361,7 +361,7 @@ def __init__(self): def if_tensor_case(x): x = fluid.dygraph.to_variable(x) - mean = fluid.layers.mean(x) + mean = paddle.mean(x) # It is equivalent to `if mean != 0` if mean: for i in range(0, 10): @@ -376,7 +376,7 @@ def if_tensor_case(x): x += i # join `and`/`or` - if fluid.layers.mean(x) + 1 and mean > 1 and x is not None or 2 > 1: + if paddle.mean(x) + 1 and mean > 1 and x is not None or 2 > 1: x -= 1 # `not` statement diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py index 00eb25792b2d2..75374cc4db797 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py @@ -19,6 +19,7 @@ from paddle.utils import gast import inspect import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_func @@ -59,7 +60,7 @@ def test_ast2func_static(self): def func(x): y = fluid.layers.relu(x) - loss = fluid.layers.mean(y) + loss = paddle.mean(y) return loss x_data = np.random.random([10, 16]).astype('float32') diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index 14683b33feb37..f240fb9e5c112 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -590,7 +590,7 @@ def val_bmn(model, args): loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func( 
                pred_bm, pred_start, pred_end, gt_iou_map, gt_start, gt_end, args)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             loss_data += [
                 avg_loss.numpy()[0],
@@ -665,7 +665,7 @@ def train_bmn(self, args, place, to_static):
             loss, tem_loss, pem_reg_loss, pem_cls_loss = bmn_loss_func(
                 pred_bm, pred_start, pred_end, gt_iou_map, gt_start, gt_end, args)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             avg_loss.backward()
             adam.minimize(avg_loss)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py
index 3d2339f58f387..68e725d7fc5f8 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cache_program.py
@@ -17,7 +17,7 @@
 import unittest
 import numpy as np
 from collections import Counter
-
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.jit import declarative
@@ -113,7 +113,7 @@ def test_with_optimizer(self):
 def simple_func(x):
     inputs = fluid.dygraph.to_variable(x)
-    mean = fluid.layers.mean(inputs)
+    mean = paddle.mean(inputs)
     return mean
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py
index 136d2d37db800..3c1f31d0638b9 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py
@@ -37,7 +37,7 @@
 # Use a decorator to test exception
 @paddle.jit.to_static
 def dyfunc_with_if(x_v):
-    if fluid.layers.mean(x_v).numpy()[0] > 5:
+    if paddle.mean(x_v).numpy()[0] > 5:
         x_v = x_v - 1
     else:
         x_v = x_v + 1
@@ -58,7 +58,7 @@ def fn1():
 @paddle.jit.to_static
 def dyfunc_with_third_library_logging(x_v):
     logging.info('test dyfunc_with_third_library_logging')
-    if fluid.layers.mean(x_v).numpy()[0] > 5:
+    if paddle.mean(x_v).numpy()[0] > 5:
         x_v = x_v - 1
     else:
         x_v = x_v + 1
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
index 8058234cb5f96..7d980b5f75a62 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py
@@ -33,7 +33,7 @@ def inner_func():
 def func_error_in_compile_time(x):
     x = fluid.dygraph.to_variable(x)
     inner_func()
-    if fluid.layers.mean(x) < 0:
+    if paddle.mean(x) < 0:
         x_v = x - 1
     else:
         x_v = x + 1
@@ -78,7 +78,7 @@ def __init__(self, fc_size=20):
     def forward(self, x):
         y = self._linear(x)
         z = fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")
-        out = fluid.layers.mean(y[z])
+        out = paddle.mean(y[z])
         return out
@@ -386,7 +386,7 @@ def set_message(self):
             'y = self._linear(x)',
             'z = fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")',
             '<--- HERE',
-            'out = fluid.layers.mean(y[z])',
+            'out = paddle.mean(y[z])',
             'return out'
         ]
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py
index 555e71ce9a0ca..d3654260d8d77 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_fetch_feed.py
@@ -16,7 +16,7 @@
 import numpy as np
 import unittest
-
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.jit import declarative
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
@@ -59,7 +59,7 @@ def __init__(self, input_dim=10, output_dim=5):
     @declarative
     def forward(self, x):
         pre = self.fc(x)
-        loss = fluid.layers.mean(pre)
+        loss = paddle.mean(pre)
         return pre, loss
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py
index 33b50af7c6dcf..108c6228499e0 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_full_name_usage.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 import unittest
 from paddle.fluid.dygraph import declarative
@@ -23,7 +24,7 @@
 @fluid.dygraph.declarative
 def dygraph_decorated_func(x):
     x = fluid.dygraph.to_variable(x)
-    if fluid.layers.mean(x) > 0:
+    if paddle.mean(x) > 0:
         x_v = x - 1
     else:
         x_v = x + 1
@@ -33,7 +34,7 @@ def dygraph_decorated_func(x):
 @fluid.dygraph.declarative
 def jit_decorated_func(x):
     x = fluid.dygraph.to_variable(x)
-    if fluid.layers.mean(x) > 0:
+    if paddle.mean(x) > 0:
         x_v = x - 1
     else:
         x_v = x + 1
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py
index 822835a8c7cd1..1f1624280a023 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py
@@ -251,7 +251,7 @@ def relu(x):
 def call_external_func(x, label=None):
-    if fluid.layers.mean(x) < 0:
+    if paddle.mean(x) < 0:
         x_v = x - 1
     else:
         x_v = add_fn(x)
@@ -274,7 +274,7 @@ class NetWithExternalFunc(fluid.dygraph.Layer):
     @declarative
     def forward(self, x, label=None):
-        if fluid.layers.mean(x) < 0:
+        if paddle.mean(x) < 0:
             x_v = x - 1
         else:
             x_v = add_fn(x)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
index ddda462525f31..0c41621f6e719 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
@@ -403,7 +403,7 @@ def forward(self, word, target, length=None):
         crf_cost = self.linear_chain_crf(input=emission,
                                          label=target,
                                          length=length)
-        avg_cost = fluid.layers.mean(x=crf_cost)
+        avg_cost = paddle.mean(x=crf_cost)
         crf_decode = self.crf_decoding(input=emission, length=length)
         return avg_cost, crf_decode
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py
index 7eccbedf4d219..8254a6d24b534 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lambda.py
@@ -16,6 +16,7 @@
 import numpy as np
 import unittest
+import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import declarative
@@ -25,7 +26,7 @@ def call_lambda_as_func(x):
     x = fluid.dygraph.to_variable(x)
     add_func = lambda x, y: x + y
-    mean_func = lambda x: fluid.layers.mean(x)
+    mean_func = lambda x: paddle.mean(x)
     y = add_func(x, 1)
     y = add_func(y, add_func(y, -1))
@@ -38,7 +39,7 @@ def call_lambda_directly(x):
     x = fluid.dygraph.to_variable(x)
     y = (lambda x, y: x + y)(x, x)
-    out = (lambda x: fluid.layers.mean(x))(y)
+    out = (lambda x: paddle.mean(x))(y)
     return out
@@ -48,7 +49,7 @@ def call_lambda_in_func(x):
     add_func = lambda x: x + 1
-    y = fluid.layers.mean((lambda x: fluid.layers.relu(x))(x))
+    y = paddle.mean((lambda x: fluid.layers.relu(x))(x))
     out = add_func(y) if y > 1 and y < 2 else (lambda x: x**2)(y)
     return out
@@ -59,7 +60,7 @@ def call_lambda_with_ifExpr(x):
     add_func = lambda x: x + 1
-    y = fluid.layers.mean(x)
+    y = paddle.mean(x)
     out = add_func(y) if y or y < 2 else (lambda x: x**2)(y)
     return out
@@ -70,7 +71,7 @@ def call_lambda_with_ifExpr2(x):
     add_func = lambda x: x + 1
-    y = fluid.layers.mean(x)
+    y = paddle.mean(x)
     # NOTE: y is Variable, but z<2 is python bool value
     z = 0
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
index 35c8b4d952295..6396c093ba137 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
@@ -119,7 +119,7 @@ def forward(self, inputs, label=None):
         if label is not None:
             acc = fluid.layers.accuracy(input=x, label=label)
             loss = fluid.layers.cross_entropy(x, label)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return x, acc, avg_loss
         else:
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
index 18694f6cdec58..29bdddf73cbdc 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
@@ -479,7 +479,7 @@ def train_mobilenet(args, to_static):
                 softmax_out = fluid.layers.softmax(out, use_cudnn=False)
                 loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
-                avg_loss = fluid.layers.mean(x=loss)
+                avg_loss = paddle.mean(x=loss)
                 acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                 acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
                 t_start_back = time.time()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
index 8549d03f7e27b..8ecae3c6b8d3a 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py
@@ -33,7 +33,7 @@ def nested_input(x, y):
     sub_res = z_elem[0] - z_elem[1]
     mul_res = y[-1]['d']['da'] * y[-1]['d']['dc']
-    mean_func = fluid.layers.mean
+    mean_func = paddle.mean
     out = mean_func(sub_res) + mean_func(sum_res) + mean_func(mul_res)
     return out
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
index 13399b63e3292..e22cee2ffeea6 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
@@ -42,7 +42,7 @@ def simple_func(x, weight_numpy):
     x = fluid.dygraph.to_variable(x)
     w = fluid.dygraph.to_variable(weight_numpy)
     y = fluid.layers.matmul(x, w)
-    z = fluid.layers.mean(y)
+    z = paddle.mean(y)
     return z
@@ -51,7 +51,7 @@ def decorated_simple_func(x, weight_numpy):
     x = fluid.dygraph.to_variable(x)
     w = fluid.dygraph.to_variable(weight_numpy)
     y = fluid.layers.matmul(x, w)
-    z = fluid.layers.mean(y)
+    z = paddle.mean(y)
     return z
@@ -91,7 +91,7 @@ def false_fn_0():
             return x_v
         _jst.IfElse(
-            fluid.layers.mean(x_v)[0] > 5, true_fn_0, false_fn_0, get_args_0,
+            paddle.mean(x_v)[0] > 5, true_fn_0, false_fn_0, get_args_0,
             set_args_0, ('x_v', ))
     def get_args_1():
@@ -148,7 +148,7 @@ def false_fn_2():
             return x_v
         _jst.IfElse(
-            fluid.layers.mean(x_v)[0] > 5, true_fn_2, false_fn_2, get_args_2,
+            paddle.mean(x_v)[0] > 5, true_fn_2, false_fn_2, get_args_2,
             set_args_2, ('x_v', ))
     def get_args_3():
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
index 553ad00a6d29b..bd1c926091c92 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py
@@ -262,7 +262,7 @@ def train(self, to_static, build_strategy=None):
                 pred = resnet(img)
                 loss = fluid.layers.cross_entropy(input=pred, label=label)
-                avg_loss = fluid.layers.mean(x=loss)
+                avg_loss = paddle.mean(x=loss)
                 acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py
index cfdd7d9df51d0..2aa2d6b96901f 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_amp.py
@@ -75,7 +75,7 @@ def train(to_static, build_strategy=None):
                 # precision problem, need to figure out the underlying reason.
                 # If we remove it, the loss between dygraph and dy2stat is exactly same.
                 loss = fluid.layers.cross_entropy(input=pred, label=label)
-                avg_loss = fluid.layers.mean(x=pred)
+                avg_loss = paddle.mean(x=pred)
                 acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
                 acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py
index fa0460f5200b2..3e301e5a6f009 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_pure_fp16.py
@@ -77,7 +77,7 @@ def train(to_static, build_strategy=None):
                                           level='O2'):
                 pred = resnet(img)
                 loss = fluid.layers.cross_entropy(input=pred, label=label)
-                avg_loss = fluid.layers.mean(x=pred)
+                avg_loss = paddle.mean(x=pred)
                 acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
                 acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
index 6c8216dac55fa..9549844f59c05 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py
@@ -45,7 +45,7 @@ def __init__(self, fc_size):
     def forward(self, x):
         y = self._linear(x)
         z = self._linear(y)
-        out = fluid.layers.mean(z)
+        out = paddle.mean(z)
         return out, y
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
index 965013adf5d8f..16e51784a07a3 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py
@@ -318,7 +318,7 @@ def forward(self, inputs, label):
         softmax_out = fluid.layers.softmax(out, use_cudnn=False)
         loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
-        avg_loss = fluid.layers.mean(x=loss)
+        avg_loss = paddle.mean(x=loss)
         acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1)
         acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
index 108c060fab868..719645aa2b5ce 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
@@ -97,7 +97,7 @@ def forward(self, inputs, label=None):
         prediction = self._fc_prediction(fc_1)
         cost = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
         acc = fluid.layers.accuracy(input=prediction, label=label)
         return avg_cost, prediction, acc
@@ -141,7 +141,7 @@ def forward(self, inputs, label=None):
         prediction = self._fc_prediction(fc_2)
         cost = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
         acc = fluid.layers.accuracy(input=prediction, label=label)
         return avg_cost, prediction, acc
@@ -189,7 +189,7 @@ def forward(self, inputs, label=None):
         prediction = self._fc_prediction(fc_2)
         cost = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
         acc = fluid.layers.accuracy(input=prediction, label=label)
         return avg_cost, prediction, acc
@@ -247,7 +247,7 @@ def forward(self, inputs, label=None):
         # TODO(Aurelius84): Uncomment the following codes when we support return variable-length vars.
         # if label is not None:
         cost = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
         acc = fluid.layers.accuracy(input=prediction, label=label)
         return avg_cost, prediction, acc
         # else:
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
index 481858be6f469..15a1db65b941a 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py
@@ -303,7 +303,7 @@ def train(args, fake_data_reader, to_static):
                 loss = fluid.layers.cross_entropy(input=outputs,
                                                   label=labels,
                                                   ignore_index=-1)
-                avg_loss = fluid.layers.mean(loss)
+                avg_loss = paddle.mean(loss)
                 acc_top1 = fluid.layers.accuracy(input=outputs, label=labels, k=1)
diff --git a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py
index c6a39bd6d0418..3748894f4effd 100644
--- a/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py
+++ b/python/paddle/fluid/tests/unittests/fleet_heter_ps_training.py
@@ -101,7 +101,7 @@ def net(batch_size=4, lr=0.01):
         predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
         cost = fluid.layers.cross_entropy(input=predict, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
     return datas, avg_cost
diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
index fe79bae75f530..ebeeb1e272f09 100755
--- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
+++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
@@ -66,7 +66,7 @@ def net(self, main_prog, startup_prog):
                                           act='softmax')
             cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                      label=input_y)
-            avg_cost = paddle.fluid.layers.mean(x=cost)
+            avg_cost = paddle.mean(x=cost)
             strategy = paddle.distributed.fleet.DistributedStrategy()
             return avg_cost, strategy
@@ -101,7 +101,7 @@ def fc_block(input_x):
                                           act='softmax')
             cost = paddle.fluid.layers.cross_entropy(input=prediction,
                                                      label=input_y)
-            avg_cost = paddle.fluid.layers.mean(x=cost)
+            avg_cost = paddle.mean(x=cost)
             strategy = paddle.distributed.fleet.DistributedStrategy()
             return avg_cost, strategy
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
index 72c2c9cc3beed..c1d144cd56443 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py
@@ -48,7 +48,7 @@ def build_model(self):
         x = paddle.static.data(name=self.feed_list[0],
                                shape=self.feed_shape[0],
                                dtype='float32')
-        out = paddle.fluid.layers.mean(x)
+        out = paddle.mean(x)
         self.fetch_list = [out.name]
     def run_model(self, exec_mode):
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py
index f800d2fc3f4de..a726e2cd061f0 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_quant_dequant_pass.py
@@ -18,6 +18,7 @@
 import numpy as np
 from inference_pass_test import InferencePassTest
 from quant_dequant_test import QuantDequantTest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.core import PassVersionChecker
@@ -54,7 +55,7 @@ def network():
             cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816])
             result = fluid.layers.relu(cout)
             loss = fluid.layers.cross_entropy(input=result, label=label_shape)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return avg_loss, result
         self.main_program.random_seed = 2
@@ -152,7 +153,7 @@ def network():
             cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816])
             result = fluid.layers.relu(cout)
             loss = fluid.layers.cross_entropy(input=result, label=label_shape)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return avg_loss, result
         self.main_program.random_seed = 2
@@ -245,7 +246,7 @@ def network():
             cout = fluid.layers.reshape(conv_out, shape=[1, 1, 10816])
             result = fluid.layers.relu(cout)
             loss = fluid.layers.cross_entropy(input=result, label=label_shape)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return avg_loss, result
         self.main_program.random_seed = 2
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py
index e62b6557844c9..cf1fa96c1204d 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py
@@ -18,6 +18,7 @@
 import numpy as np
 from inference_pass_test import InferencePassTest
 from quant_dequant_test import QuantDequantTest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.core import AnalysisConfig
@@ -40,7 +41,7 @@ def network():
                                      act="relu")
             result = fluid.layers.relu(fc_out)
             loss = fluid.layers.cross_entropy(input=result, label=self.label)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return avg_loss, result
         self.main_program.random_seed = 2
@@ -105,7 +106,7 @@ def network():
             c_out = fluid.layers.reshape(fc_out, shape=[0, 784])
             result = fluid.layers.relu(c_out)
             loss = fluid.layers.cross_entropy(input=result, label=self.label)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return avg_loss, result
         self.main_program.random_seed = 2
@@ -172,7 +173,7 @@ def network():
             c_out = fluid.layers.reshape(fc_out, shape=[1, 1, 2744])
             result = fluid.layers.relu(c_out)
             loss = fluid.layers.cross_entropy(input=result, label=label_shape)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return avg_loss, result
         self.main_program.random_seed = 2
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py
index 01f65b54bd4ae..baf02fc423309 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py
@@ -16,6 +16,7 @@
 import numpy as np
 from inference_pass_test import InferencePassTest
 from quant_dequant_test import QuantDequantTest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.core import PassVersionChecker
@@ -44,7 +45,7 @@ def network():
                                      act=None)
             result = fluid.layers.relu(fc_out)
             loss = fluid.layers.cross_entropy(input=result, label=self.label)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return avg_loss, result
         self.main_program.random_seed = 2
@@ -136,7 +137,7 @@ def network():
                                      act=None)
             result = fluid.layers.relu(fc_out)
             loss = fluid.layers.cross_entropy(input=result, label=self.label)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return avg_loss, result
         self.main_program.random_seed = 2
@@ -227,7 +228,7 @@ def network():
                                      act=None)
             result = fluid.layers.relu(fc_out)
             loss = fluid.layers.cross_entropy(input=result, label=self.label)
-            avg_loss = fluid.layers.mean(loss)
+            avg_loss = paddle.mean(loss)
             return avg_loss, result
         self.main_program.random_seed = 2
diff --git a/python/paddle/fluid/tests/unittests/ir/pass_test.py b/python/paddle/fluid/tests/unittests/ir/pass_test.py
index 56e31aa705ff2..04e98202dd4a3 100644
--- a/python/paddle/fluid/tests/unittests/ir/pass_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py
@@ -20,7 +20,7 @@
 import unittest
 import warnings
 import numpy as np
-
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.framework import Program, Block
@@ -56,7 +56,7 @@ def grad(self, var):
     def append_gradients(self, outs):
         with fluid.program_guard(self.main_program, self.startup_program):
-            loss = fluid.layers.mean(outs)
+            loss = paddle.mean(outs)
             fluid.backward.append_backward(loss)
     def check_output(self, startup_on_cpu=False, atol=1e-5):
diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py
index 0c9170242e7de..3f8703b657e8d 100644
--- a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py
+++ b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py
@@ -41,7 +41,7 @@ def linear_fc(num):
             for _ in six.moves.xrange(num):
                 hidden = fluid.layers.fc(hidden, size=128, act='relu')
             loss = fluid.layers.cross_entropy(input=hidden, label=label)
-            loss = fluid.layers.mean(loss)
+            loss = paddle.mean(loss)
             return loss
         main_program = Program()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py
index abe16155d0362..b8272e3bce9da 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_momentum_op_mlu.py
@@ -148,7 +148,7 @@ def test_momentum(self):
             y = fluid.layers.data(name='y', shape=[1], dtype='float32')
             y_predict = fluid.layers.fc(input=x, size=1, act=None)
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
+            avg_cost = paddle.mean(cost)
             rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1,
                                                       momentum=0.9)
@@ -271,7 +271,7 @@ def test_momentum_static(self):
            y = fluid.layers.data(name='y', shape=[1], dtype='float32')
             y_predict = fluid.layers.fc(input=x, size=1, act=None)
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
+            avg_cost = paddle.mean(cost)
             momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum(
                 learning_rate=0.1, momentum=0.9)
@@ -591,7 +591,7 @@ def _momentum_optimize_static(self,
                                        name='X',
                                        dtype='float32')
             hidden = paddle.static.nn.fc(x=data, size=10)
-            loss = paddle.fluid.layers.mean(hidden)
+            loss = paddle.mean(hidden)
             optimizer.minimize(loss)
             exe.run(startup_program)
             if use_amp:
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py
index 38d5e6e94c066..682a9e3909cc7 100644
--- a/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py
+++ b/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py
@@ -107,7 +107,7 @@ def test_api(self, use_mlu=False):
             x.stop_gradient = x_stop_gradient
             y.stop_gradient = y_stop_gradient
             result = paddle.where(cond, x, y)
-            append_backward(layers.mean(result))
+            append_backward(paddle.mean(result))
         for use_mlu in [False, True]:
             place = (paddle.device.MLUPlace(0) if use_mlu else fluid.CPUPlace())
diff --git a/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py
index 6c2e24bb16382..93b1f06598de9 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_momentum_op_npu.py
@@ -117,7 +117,7 @@ def test_momentum(self):
             y = fluid.layers.data(name='y', shape=[1], dtype='float32')
             y_predict = fluid.layers.fc(input=x, size=1, act=None)
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
+            avg_cost = paddle.mean(cost)
             rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1,
                                                       momentum=0.9)
@@ -243,7 +243,7 @@ def test_momentum_static(self):
             y = fluid.layers.data(name='y', shape=[1], dtype='float32')
             y_predict = fluid.layers.fc(input=x, size=1, act=None)
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
+            avg_cost = paddle.mean(cost)
             momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum(
                 learning_rate=0.1, momentum=0.9)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py
index 9d734eac48be0..ada6e0f5f5384 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py
@@ -86,7 +86,7 @@ def _test(self, run_npu=True):
             prob = fluid.layers.softmax(prediction, axis=1)
             cost = fluid.layers.cross_entropy(input=prob, label=label)
-            loss = fluid.layers.mean(cost)
+            loss = paddle.mean(cost)
             sgd = fluid.optimizer.SGD(learning_rate=0.01)
             sgd.minimize(loss)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py
index c90bf0cb49398..2c3b07c5bb2b9 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_where_op_npu.py
@@ -105,7 +105,7 @@ def test_api(self):
             y.stop_gradient = y_stop_gradient
             result = paddle.where(cond, x, y)
-            append_backward(fluid.layers.mean(result))
+            append_backward(paddle.mean(result))
             exe = fluid.Executor(self.place)
             exe.run(startup)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py
index 22918347a2de3..b3d7f41ec9d4b 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_while_op_npu.py
@@ -90,7 +90,7 @@ def simple_net(self):
             layers.array_write(result2, i=j, array=mem_array)
             layers.less_than(x=j, y=array_len2, cond=cond2)
         sum_result = layers.array_read(array=mem_array, i=j)
-        loss = layers.mean(sum_result)
+        loss = paddle.mean(sum_result)
         return loss, sum_result
     def test_simple_net(self):
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
index 93ca1fa5b56a0..4a6905ca66b89 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
@@ -112,7 +112,7 @@ def forward(self, inputs, label):
         x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape])
         cost = self._fc(x)
         loss = fluid.layers.cross_entropy(cost, label)
-        avg_loss = fluid.layers.mean(loss)
+        avg_loss = paddle.mean(loss)
         return avg_loss
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
index 6ee04dd342b81..460e6110aba73 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
@@ -335,7 +335,7 @@ def run_one_loop(self, model, opt, data):
         out = model(img)
         softmax_out = fluid.layers.softmax(out, use_cudnn=False)
         loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
-        avg_loss = fluid.layers.mean(x=loss)
+        avg_loss = paddle.mean(x=loss)
         return avg_loss
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
index a8e099137a349..c35e6d37b43e0 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
@@ -94,7 +94,7 @@ def run_one_loop(self, model, opt, data):
         out = model(img)
-        out = fluid.layers.mean(out)
+        out = paddle.mean(out)
         return out
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
index 90238f56eea24..928d5e4b83f3d 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py
@@ -104,7 +104,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
             predict = cnn_model(images)
         with fluid.device_guard("gpu:1"):
             cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
+            avg_cost = paddle.mean(x=cost)
         # Evaluator
         with fluid.device_guard("gpu:1"):
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py
index 3ec8dfb44850e..e0323092164ca 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py
@@ -104,7 +104,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
             predict = cnn_model(images)
         with fluid.device_guard("gpu:1"):
             cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
+            avg_cost = paddle.mean(x=cost)
         # Evaluator
         with fluid.device_guard("gpu:1"):
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
index cfc5a4904ac3e..61d7208617dee 100644
--- a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py
@@ -98,7 +98,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
             predict = cnn_model(images)
         with fluid.device_guard("gpu:0"):
             cost = fluid.layers.cross_entropy(input=predict, label=label)
-            avg_cost = fluid.layers.mean(x=cost)
+            avg_cost = paddle.mean(x=cost)
         # Evaluator
         with fluid.device_guard("gpu:0"):
diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py
index b014a079b80e3..68af1bdcc9385 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_net.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_net.py
@@ -17,6 +17,7 @@
 fluid.core._set_eager_deletion_mode(-1, -1, False)
+import paddle
 import paddle.fluid.layers.ops as ops
 from paddle.fluid.layers.learning_rate_scheduler import cosine_decay
 from simple_nets import init_data
@@ -172,7 +173,7 @@ def SE_ResNeXt50Small(use_feed):
     # Classifier layer:
     prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
+    loss = paddle.mean(loss)
     return loss
diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py
index b9e38d21da831..9326d51591576 100644
--- a/python/paddle/fluid/tests/unittests/simple_nets.py
+++ b/python/paddle/fluid/tests/unittests/simple_nets.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import paddle
 import paddle.fluid as fluid
 import numpy as np
@@ -27,7 +28,7 @@ def simple_fc_net_with_inputs(img, label, class_num=10):
                     value=1.0)))
     prediction = fluid.layers.fc(hidden, size=class_num, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
+    loss = paddle.mean(loss)
     return loss
@@ -51,7 +52,7 @@ def batchnorm_fc_with_inputs(img, label, class_num=10):
     prediction = fluid.layers.fc(hidden, size=class_num, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
+    loss = paddle.mean(loss)
     return loss
@@ -87,7 +88,7 @@ def bow_net(use_feed,
     fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
     prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
     cost = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
+    avg_cost = paddle.mean(x=cost)
     return avg_cost
diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
index 5d96dc38a7103..b6c2a47ac38ae 100644
--- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
@@ -136,7 +136,7 @@ def test_adadelta(self):
             y = fluid.layers.data(name='y', shape=[1], dtype='float32')
             y_predict = fluid.layers.fc(input=x, size=1, act=None)
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
+            avg_cost = paddle.mean(cost)
             rms_optimizer = paddle.optimizer.Adadelta(learning_rate=0.1)
             rms_optimizer.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 61597562a4ab0..428d0e7c21026 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -932,7 +932,7 @@ def test_adam_flatten_param_grads_with_regularizer(self):
                                         act=None,
                                         param_attr=weight_attr)
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
+            avg_cost = paddle.mean(cost)
             adam = fluid.optimizer.AdamOptimizer(0.01,
                                                  flatten_param_grads=True,
@@ -1149,7 +1149,7 @@ def _adam_optimize_static(self,
                                        name='X',
                                        dtype='float32')
             hidden = paddle.static.nn.fc(x=data, size=10)
-            loss = paddle.fluid.layers.mean(hidden)
+            loss = paddle.mean(hidden)
             optimizer.minimize(loss)
             exe.run(startup_program)
             if use_amp:
diff --git a/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py b/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py
index cc57293a7fa04..1f08eb085a3c5 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_optimizer_fp32_fp64.py
@@ -33,7 +33,7 @@ def main_test_func(place, dtype):
             y = fluid.data(name='y', shape=[None, 1], dtype=dtype)
             y_predict = fluid.layers.fc(input=x, size=1, act=None)
             cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-            avg_cost = fluid.layers.mean(cost)
+            avg_cost = paddle.mean(cost)
             adam_optimizer = fluid.optimizer.AdamOptimizer(0.01)
             adam_optimizer.minimize(avg_cost)
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index 2ece3d2d8ddf0..e39638d86555e 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -584,7 +584,7 @@ def test_adamw_op(self):
             fc2_b_mon2 = np.zeros((linear2.bias.shape)).astype("float32")
             cost = fluid.layers.square_error_cost(input=out, label=y)
-            avg_cost = fluid.layers.mean(cost)
+            avg_cost = paddle.mean(cost)
             simple_lr_fun = partial(simple_lr_setting,
                                     decay_rate=0.8,
diff --git a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
index 8ed220daf035a..a8d630278f735 100644
--- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
+++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
@@ -44,15 +44,15 @@ def _test_read_write(x):
     i = layers.increment(x=i)
     a2 = layers.array_read(array=arr, i=i)
-    mean_a0 = layers.mean(a0)
-    mean_a1 = layers.mean(a1)
-    mean_a2 = layers.mean(a2)
+    mean_a0 = paddle.mean(a0)
+    mean_a1 = paddle.mean(a1)
+    mean_a2 = paddle.mean(a2)
     a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])
-    mean_x0 = layers.mean(x[0])
-    mean_x1 = layers.mean(x[1])
-    mean_x2 = layers.mean(x[2])
+    mean_x0 = paddle.mean(x[0])
+    mean_x1 = paddle.mean(x[1])
+    mean_x2 = paddle.mean(x[2])
     x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])
diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py
index 31afb85750e8c..147633a62afd1 100644
--- a/python/paddle/fluid/tests/unittests/test_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_op.py
@@ -82,7 +82,7 @@ def test_assign_LoDTensorArray(self):
             init_array = fluid.layers.array_write(x=z, i=i)
             array = fluid.layers.assign(init_array)
             sums = fluid.layers.array_read(array=init_array, i=i)
-            mean = fluid.layers.mean(sums)
+            mean = paddle.mean(sums)
             append_backward(mean)
         fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
@@ -128,7 +128,7 @@ def test_assign_LoDTensorArray(self):
             init_array = fluid.layers.array_write(x=z, i=i)
             array = paddle.assign(init_array)
             sums = fluid.layers.array_read(array=init_array, i=i)
-            mean = fluid.layers.mean(sums)
+            mean = paddle.mean(sums)
             append_backward(mean)
         place = fluid.CUDAPlace(
diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
index 9dee8088ecd96..497457197dcc5 100644
--- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
@@ -54,7 +54,7 @@ def convolutional_neural_network(use_py_reader):
     prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    avg_loss = fluid.layers.mean(loss)
+    avg_loss = paddle.mean(loss)
     acc = fluid.layers.accuracy(input=prediction, label=label)
     i = fluid.layers.zeros(shape=[1], dtype='int64')
     array = fluid.layers.array_write(x=prediction, i=i)
diff --git a/python/paddle/fluid/tests/unittests/test_backward.py b/python/paddle/fluid/tests/unittests/test_backward.py
index a6c9caacc7806..b64c9e98654e1 100644
--- a/python/paddle/fluid/tests/unittests/test_backward.py
+++ b/python/paddle/fluid/tests/unittests/test_backward.py
@@ -180,7 +180,7 @@ def __init__(self):
             u'softmax', # fc
             u'elementwise_sub',
             u'square',
-            u'mean'
+            u'reduce_mean'
         ] # loss
         self.shape = [16, 50]
@@ -235,7 +235,7 @@ def build_model(self):
                                    name='fc_no_use')
         # loss
         cost = fluid.layers.square_error_cost(input=predict, label=label)
-        loss = fluid.layers.mean(cost, name='mean_loss')
+        loss = paddle.mean(cost, name='mean_loss')
         return loss
@@ -308,7 +308,7 @@ def build_net(self):
         x_emb = fluid.embedding(x, size=[100, 256])
         y_predict = fluid.layers.fc(input=x_emb, size=1, name='my_fc')
         loss = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_loss = fluid.layers.mean(loss)
+        avg_loss = paddle.mean(loss)
         param_names = [
             param.name
             for param in fluid.default_main_program().block(0).all_parameters()
diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
index 53c578fc6c1e8..92eb35896255d 100644
--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
@@ -31,7 +31,7 @@ def test_calc_gradient(self):
             x = layers.create_parameter(dtype="float32", shape=[5, 10])
             y = layers.create_parameter(dtype="float32", shape=[10, 8])
             mul_out = layers.mul(x=x, y=y)
-            mean_out = layers.mean(mul_out)
+            mean_out = paddle.mean(mul_out)
             a = calc_gradient(mean_out, mul_out)
             b = calc_gradient(mean_out, x)
         place = fluid.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_case.py b/python/paddle/fluid/tests/unittests/test_case.py
index ed633c758b540..79bb1e0bffd44 100644
--- a/python/paddle/fluid/tests/unittests/test_case.py
+++ b/python/paddle/fluid/tests/unittests/test_case.py
@@ -17,6 +17,7 @@
 import numpy as np
 import unittest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.layers as layers
@@ -266,12 +267,12 @@ def test_optimizer_in_case(self):
         def fn_1():
             sum = layers.elementwise_mul(x, y)
-            loss = layers.mean(sum, name="f_1_loss")
+            loss = paddle.mean(sum, name="f_1_loss")
             adam.minimize(loss)
         def fn_2():
             sum = layers.elementwise_mul(x, y)
-            loss = layers.mean(sum, name="f_2_loss")
+            loss = paddle.mean(sum, name="f_2_loss")
             adagrad.minimize(loss)
         layers.case(pred_fn_pairs=[(switch_id == one, fn_1)], default=fn_2)
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_async.py b/python/paddle/fluid/tests/unittests/test_communicator_async.py
index f6fd89dc37dae..a6ec339c9b9c3 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_async.py
@@ -36,7 +36,7 @@ def net(self):
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=x, label=y)
-        avg_cost = fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         return avg_cost
     def test_communicator_async(self):
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
index c3f2566d6f7f4..f7593f8bb31fe 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@@ -53,7 +53,7 @@ def net(self):
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         return avg_cost, x, x1, y
     def fake_reader(self):
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
index c4a7edc21f92b..4c06e80547f1f 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
@@ -38,7 +38,7 @@ def net(self):
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         return avg_cost, x, y
     def fake_reader(self):
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
index 5726372e40f97..0846eb4dbdb1e 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
@@ -59,7 +59,7 @@ def test_communicator_ps_gpu(self):
         slots_vars = [x, y]
         cost = fluid.layers.square_error_cost(input=x, label=y)
-        avg_cost = fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         optimizer = fluid.optimizer.Adam(0.01)
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
index f13cfd885765a..1380866652f2e 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py
@@ -34,7 +34,7 @@ def net(self):
         x = fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=x, label=y)
-        avg_cost = fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         return avg_cost
     def test_communicator_sync(self):
diff --git a/python/paddle/fluid/tests/unittests/test_compiled_program.py b/python/paddle/fluid/tests/unittests/test_compiled_program.py
index e16ac4881c761..fab70b2c6ada4 100644
--- a/python/paddle/fluid/tests/unittests/test_compiled_program.py
+++ b/python/paddle/fluid/tests/unittests/test_compiled_program.py
@@ -105,7 +105,7 @@ def build_simple_model(self):
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
         prediction = fluid.layers.fc(input=img, size=10, act='softmax')
         loss = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_loss = fluid.layers.mean(loss)
+        avg_loss = paddle.mean(loss)
     def compile_program_not_compiled(self):
         with fluid.program_guard(fluid.Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py
index 1680461305188..1a75b30d4849a 100644
--- a/python/paddle/fluid/tests/unittests/test_cond.py
+++ b/python/paddle/fluid/tests/unittests/test_cond.py
@@ -17,7 +17,7 @@
 import numpy as np
 import os
 import unittest
-
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import paddle.fluid.layers as layers
@@ -287,7 +287,7 @@ def greater_equal_branch(i, a):
             a = 2.0 * i
             out = layers.cond(i < 5.0, lambda: less_than_branch(i, a),
                               lambda: greater_equal_branch(i, a))
-            mean = layers.mean(out)
+            mean = paddle.mean(out)
             append_backward(mean)
         place = fluid.CUDAPlace(
@@ -503,10 +503,10 @@ def branch(i, img, label):
         def cond_func_simple_net_at_true(i, img, label):
             return layers.cond(i < 5, lambda: branch(i, img, label),
-                               lambda: layers.mean(img))
+                               lambda: paddle.mean(img))
         def cond_func_simple_net_at_false(i, img, label):
-            return layers.cond(i < 5, lambda: layers.mean(img),
+            return layers.cond(i < 5, lambda: paddle.mean(img),
                                lambda: branch(i, img, label))
         for use_parallel_exe in [False, True]:
diff --git a/python/paddle/fluid/tests/unittests/test_conditional_block.py b/python/paddle/fluid/tests/unittests/test_conditional_block.py
index 64980115d9ea6..dc59246faa343 100644
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py
@@ -16,6 +16,7 @@
 import numpy as np
 import unittest
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 import paddle.fluid.core as core
@@ -46,7 +47,7 @@ def test_forward(self):
         outs = exe.run(main_program, feed={'X': x}, fetch_list=[out])[0]
         print(outs)
-        loss = layers.mean(out)
+        loss = paddle.mean(out)
         append_backward(loss=loss)
         outs = exe.run(
             main_program,
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
index 86e23c79d07a4..ed01e7e06f6a9 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset.py
@@ -943,7 +943,7 @@ def test_dataset_fleet(self):
                 slots_vars.append(var)
             fake_cost = \
                 fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1])
-            fake_cost = fluid.layers.mean(fake_cost)
+            fake_cost = paddle.mean(fake_cost)
         with fluid.scope_guard(scope):
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
@@ -1008,7 +1008,7 @@ def test_dataset_fleet2(self):
                 slots_vars.append(var)
             fake_cost = \
                 fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1])
-            fake_cost = fluid.layers.mean(fake_cost)
+            fake_cost = paddle.mean(fake_cost)
         with fluid.scope_guard(scope):
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
@@ -1136,7 +1136,7 @@ def test_bosps_dataset_fleet2(self):
                 slots_vars.append(var)
             fake_cost = \
                 fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1])
-            fake_cost = fluid.layers.mean(fake_cost)
+            fake_cost = paddle.mean(fake_cost)
         with fluid.scope_guard(scope):
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
index 75dc36f9bb938..ba89f623b2cef 100644
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
@@ -61,7 +61,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer):
             predict_label = fluid.layers.fc(hidden, size=CLASS_NUM, act='softmax')
-            loss = fluid.layers.mean(
+            loss = paddle.mean(
                 fluid.layers.cross_entropy(input=predict_label, label=label))
             optimizer = fluid.optimizer.Adam()
diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py
index c82ba2bc8cb8e..33ae3f0c6d024 100644
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -77,7 +77,7 @@ def get_model(batch_size):
     # Train program
     predict = cnn_model(images)
     cost = fluid.layers.cross_entropy(input=predict, label=label)
-    avg_cost = fluid.layers.mean(x=cost)
+    avg_cost = paddle.mean(x=cost)
     # Evaluator
     batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
@@ -181,7 +181,7 @@ def test_clone_with_stop_gradient(self):
         loss = fluid.layers.cross_entropy(
             input=fluid.layers.fc(hidden2, size=10, act='softmax'),
             label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
-        avg_loss = fluid.layers.mean(loss)
+        avg_loss = paddle.mean(loss)
         test_program = train_program.clone(for_test=False)
         self.assertEqual(
@@ -217,7 +217,7 @@ def false_fn():
         loss = fluid.layers.cross_entropy(
             input=fluid.layers.fc(hidden2, size=10, act='softmax'),
             label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
-        avg_loss = fluid.layers.mean(loss)
+        avg_loss = paddle.mean(loss)
         test_program = train_program.clone(for_test=False)
         self.assertEqual(
@@ -256,7 +256,7 @@ def false_fn():
         loss = fluid.layers.cross_entropy(
             input=fluid.layers.fc(hidden2, size=10, act='softmax'),
             label=fluid.layers.data(name='label', shape=[1], dtype='int64'))
-        avg_loss = fluid.layers.mean(loss)
+        avg_loss = paddle.mean(loss)
         test_program = train_program.clone(for_test=False)
         self.assertRaises(ValueError, train_program._copy_data_info_from,
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
index 38fea7f2413c7..8f66b9098d23f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py
@@ -50,7 +50,7 @@ def test_a_sync_optimizer_trainer(self):
         x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
-        avg_cost = paddle.fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = False
@@ -88,7 +88,7 @@ def test_a_sync_optimizer_pserver(self):
         x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
-        avg_cost = paddle.fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
index 3e683b0d693c0..64ee376c176ce 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py
@@ -54,7 +54,7 @@ def test_a_sync_optimizer1(self):
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
         cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
         os.environ["FLAGS_LAUNCH_BARRIER"] = "0"
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
index d2ed6ad7ff1de..07dffa9efb14d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
@@ -66,7 +66,7 @@ def test_a_sync_optimizer3(self):
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
         cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
         os.environ["FLAGS_LAUNCH_BARRIER"] = "0"
         strategy = paddle.distributed.fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
index 707f072060a80..d73e5ab16fd88 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
@@ -58,7 +58,7 @@ def test_a_sync_optimizer2(self):
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
         cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
         os.environ["FLAGS_LAUNCH_BARRIER"] = "0"
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.auto = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
index 51eb9b81619b7..c96d6768155fd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
@@ -55,7 +55,7 @@ def test_a_sync_optimizer_trainer(self):
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
         cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
@@ -88,7 +88,7 @@ def test_a_sync_optimizer_pserver(self):
         prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
         cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
-        avg_cost = paddle.fluid.layers.mean(x=cost)
+        avg_cost = paddle.mean(x=cost)
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
index 3d7aa1b3fee0d..50b4e867678d4 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py
@@ -41,7 +41,7 @@ def test_gradient_merge_optimizer(self):
         x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
        y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
-        avg_cost = paddle.fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
index 7e3e5258aed60..67e2c3ffb85ca 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py
@@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt):
                                            value=0.0,
                                            dtype='float32'), loss_op2)
-        avg_cost = fluid.layers.mean(loss_op3)
+        avg_cost = paddle.mean(loss_op3)
         return avg_cost
     is_distributed = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
index ba70a3d1def7f..01560951c0c5d 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps11.py
@@ -67,7 +67,7 @@ def get_loss(cos_q_pt, cos_q_nt):
                                            value=0.0,
                                            dtype='float32'), loss_op2)
-        avg_cost = fluid.layers.mean(loss_op3)
+        avg_cost = paddle.mean(loss_op3)
         return avg_cost
     is_distributed = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
index af61dc7fa3cf9..45d74260b09f6 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
@@ -70,7 +70,7 @@ def get_loss(cos_q_pt, cos_q_nt):
                                            value=0.0,
                                            dtype='float32'), loss_op2)
-        avg_cost = fluid.layers.mean(loss_op3)
+        avg_cost = paddle.mean(loss_op3)
         return avg_cost
     is_distributed = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
index 243023b4fe1c6..216ea4c2926fd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
@@ -70,7 +70,7 @@ def get_loss(cos_q_pt, cos_q_nt):
                                            value=0.0,
                                            dtype='float32'), loss_op2)
-        avg_cost = fluid.layers.mean(loss_op3)
+        avg_cost = paddle.mean(loss_op3)
         return avg_cost
     is_distributed = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
index b8ff052c192cd..d6fe562dc93b9 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py
@@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt):
                                            value=0.0,
                                            dtype='float32'), loss_op2)
-        avg_cost = fluid.layers.mean(loss_op3)
+        avg_cost = paddle.mean(loss_op3)
         return avg_cost
     is_distributed = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
index 32af1959f25db..338ef3af6621e 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py
@@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt):
                                            value=0.0,
                                            dtype='float32'), loss_op2)
-        avg_cost = fluid.layers.mean(loss_op3)
+        avg_cost = paddle.mean(loss_op3)
         return avg_cost
     is_distributed = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
index 63ea8f639aae4..12a65a01cfe6d 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py
@@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt):
                                            value=0.0,
                                            dtype='float32'), loss_op2)
-        avg_cost = fluid.layers.mean(loss_op3)
+        avg_cost = paddle.mean(loss_op3)
         return avg_cost
     is_distributed = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
index 692f586a43546..31f3c8d6d8a4f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps6.py
@@ -66,7 +66,7 @@ def get_loss(cos_q_pt, cos_q_nt):
                                            value=0.0,
                                            dtype='float32'), loss_op2)
-        avg_cost = fluid.layers.mean(loss_op3)
+        avg_cost = paddle.mean(loss_op3)
         return avg_cost
     is_distributed = False
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py
index d692528f5bb34..d929edfabd467 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_trainer_desc_config.py
@@ -44,7 +44,7 @@ def test_trainer_desc_config(self):
         x = paddle.fluid.layers.data(name='x', shape=[1], dtype='float32')
         y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = paddle.fluid.layers.square_error_cost(input=x, label=y)
-        avg_cost = paddle.fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
index 265e59ff94919..ce5e79fcd9b31 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py
@@ -53,7 +53,7 @@ def test_open_sync_batch_norm(self):
             data = fluid.layers.data(name='X', shape=[1], dtype='float32')
             hidden = fluid.layers.fc(input=data, size=10)
-            loss = fluid.layers.mean(hidden)
+            loss = paddle.mean(hidden)
             optimizer = fluid.optimizer.AdamOptimizer()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
index 5905b682d8941..e04638a1d80e6 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -26,6 +26,7 @@
 gc.set_debug(gc.DEBUG_COLLECTABLE)
+import paddle
 import paddle.fluid as fluid
@@ -51,7 +52,7 @@ def net_conf(self):
                                        bias_attr=fluid.ParamAttr(name='fc_b'))
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
         sgd_optimizer.minimize(avg_cost)
@@ -257,7 +258,7 @@ def net_conf(self):
                                        bias_attr=fluid.ParamAttr(name='fc_b'))
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         sgd_optimizer = fluid.optimizer.SGD(
             learning_rate=fluid.layers.exponential_decay(learning_rate=1.0,
                                                          decay_steps=2100,
@@ -402,7 +403,7 @@ def net_conf(self):
                                        bias_attr=fluid.ParamAttr(name='fc_b'))
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1)
         opt.minimize(avg_cost)
@@ -422,7 +423,7 @@ def net_conf(self):
                                        bias_attr=fluid.ParamAttr(name='fc_b'))
         y = fluid.layers.data(name='y', shape=[1], dtype='float32')
         cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-        avg_cost = fluid.layers.mean(cost)
+        avg_cost = paddle.mean(cost)
         opt = fluid.optimizer.Ftrl(learning_rate=0.1)
         opt.minimize(avg_cost)
@@
-442,7 +443,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.piecewise_decay([10000, 20000], [1.0, 0.5, 1.0])) @@ -491,7 +492,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) def filter(param): @@ -523,7 +524,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) base_lr = 1.0 bd = [1, 10, 20, 30] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] @@ -568,7 +569,7 @@ def net_conf(self): bias_attr=False) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0) sgd_optimizer.minimize(avg_cost) @@ -624,7 +625,7 @@ def emb_pool(ids, table_name, is_distributed): label = fluid.layers.data(name='label', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) optimizer = fluid.optimizer.Adam(learning_rate=0.003) optimizer.minimize(avg_cost) @@ -852,7 +853,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) optimizer.minimize(avg_cost) @@ -882,7 +883,7 @@ def net_conf(self): bias_attr=fluid.ParamAttr(name='fc_b')) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) optimizer.minimize(avg_cost) @@ -1027,7 +1028,7 @@ def network_with_table(self, is_sparse, is_distributed): seed=1, num_neg_samples=5, is_sparse=is_sparse) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer optimizer = fluid.optimizer.Adam(learning_rate=0.003) optimizer.minimize(avg_cost) @@ -1096,7 +1097,7 @@ def network_with_table(self, is_sparse, is_distributed): path_code=path_code, is_custom=True, is_sparse=is_sparse) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer optimizer = fluid.optimizer.SGD(learning_rate=0.003) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_distributed_strategy.py index 491555907ec40..92a6715a6424e 100644 --- a/python/paddle/fluid/tests/unittests/test_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_distributed_strategy.py @@ -13,6 +13,7 @@ # limitations under the License. 
import unittest +import paddle import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig, ServerRuntimeConfig from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory @@ -238,7 +239,7 @@ def test_debug_info(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) role = role_maker.UserDefinedRoleMaker( current_id=0, diff --git a/python/paddle/fluid/tests/unittests/test_downpoursgd.py b/python/paddle/fluid/tests/unittests/test_downpoursgd.py index 030af8f809e3e..16e9948a7e6d4 100644 --- a/python/paddle/fluid/tests/unittests/test_downpoursgd.py +++ b/python/paddle/fluid/tests/unittests/test_downpoursgd.py @@ -60,7 +60,7 @@ def test_device_work_use_cvm(self): y_predict = fluid.layers.fc(input=x_emb, size=1, act=None) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) ps_param = pslib.PSParameter() with open("{}/fleet_desc.prototxt".format(cache_path)) as f: @@ -120,7 +120,7 @@ def test_device_work(self): y_predict = fluid.layers.fc(input=x_emb, size=1, act=None) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) ps_param = pslib.PSParameter() with open("{}/fleet_desc.prototxt".format(cache_path)) as f: @@ -178,7 +178,7 @@ def test_downpour_opt_work(self): y_predict = fluid.layers.fc(input=x_emb, size=1, act=None) y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) ps_param = pslib.PSParameter() with open("{}/fleet_desc.prototxt".format(cache_path)) as f: diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py index f77f54a636ee7..c0ff50d58cfa7 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py @@ -17,6 +17,7 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear from paddle.fluid.framework import _test_eager_guard @@ -111,7 +112,7 @@ def forward(self, inputs, label): x = fluid.layers.reshape(x, shape=[-1, self.pool_2_shape]) cost = self._linear(x) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py index 2487bc15660e2..814ef31102fc2 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py @@ -143,7 +143,7 @@ def test_mnist_forward_float32(self): cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) dy_out = avg_loss.numpy() @@ -169,7 +169,7 @@ def test_mnist_forward_float32(self): label = fluid.layers.data(name='label', 
shape=[1], dtype='int64') cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) # initialize params and fetch them static_param_init_value = {} diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index 0698a8b40df59..e811fe481f9fc 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -131,7 +131,7 @@ def test_plain_while_op(self): label = fluid.layers.data(name='label', shape=[1], dtype='float32') loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(1e-4) sgd.minimize(loss=loss) @@ -174,7 +174,7 @@ def test_train_dynamic_rnn(self): label = fluid.layers.data(name='label', shape=[1], dtype='float32') loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.Adam(1e-3) sgd.minimize(loss=loss) @@ -242,7 +242,7 @@ def test_train_nested_dynamic_rnn(self): logits = fluid.layers.fc(input=last, size=1, act=None) loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(1e-3) sgd.minimize(loss=loss) @@ -303,7 +303,7 @@ def test_train_nested_dynamic_rnn2(self): logits = fluid.layers.fc(input=last, size=1, act=None) loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logits, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(1e-3) sgd.minimize(loss=loss) diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py index 0d6fa635a8fd4..e81da693b7f0b 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py @@ -17,6 +17,7 @@ import numpy import random import collections +import paddle import paddle.fluid as fluid import unittest from decorator_helper import * @@ -276,7 +277,7 @@ def test_forward_backward(self): out = rnn() out = fluid.layers.sequence_pool(out, pool_type='last') - loss = fluid.layers.mean(out) + loss = paddle.mean(out) fluid.backward.append_backward(loss) cpu = fluid.CPUPlace() @@ -357,7 +358,7 @@ def test_forward_backward(self): out = rnn() last = fluid.layers.sequence_pool(input=out, pool_type='last') - loss = fluid.layers.mean(last) + loss = paddle.mean(last) fluid.backward.append_backward(loss) cpu = fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index 07f7fa818aa0e..1daa68aa01599 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -130,7 +130,7 @@ def build_graph(self, only_forward=False): return static_input_step_outs last = fluid.layers.sequence_pool(input=rnn(), pool_type='last') - loss = fluid.layers.mean(last) + loss = paddle.mean(last) append_backward(loss) static_input_grad = self._program.global_block().var( framework.grad_var_name('static_input_tensor')) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py index 
4bf8faf25ef44..ed7a3c0f0fe81 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py @@ -44,7 +44,7 @@ def simple_fc_net(): value=1.0))) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) optimizer = fluid.optimizer.Adam(learning_rate=1e-3) optimizer.minimize(loss) return image, label, loss diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py index 39dc0caefd335..ac501a43ca75e 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -14,6 +14,7 @@ import unittest from test_eager_deletion_dynamic_rnn_base import TestBase +import paddle import paddle.fluid as fluid fluid.core._set_eager_deletion_mode(0.0, 1.0, True) @@ -38,7 +39,7 @@ def gru_net(data, fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py index 07f78d3b84568..bb6f608201573 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -13,6 +13,7 @@ # limitations under the License. from test_eager_deletion_dynamic_rnn_base import TestBase +import paddle import paddle.fluid as fluid import unittest @@ -40,7 +41,7 @@ def lstm_net(data, fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py index 907e167b5f1d4..195278253e8a3 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py @@ -140,7 +140,7 @@ def setUp(self): self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -274,7 +274,7 @@ def setUp(self): self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -375,7 +375,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) 
def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -456,7 +456,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -533,7 +533,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -654,7 +654,7 @@ def setUp(self): forward_only_rnn.output(h) forward_only_output = forward_only_rnn() forward_only_output.stop_gradient = True - self.forward_only_output = layers.mean(forward_only_output) + self.forward_only_output = paddle.mean(forward_only_output) rnn = layers.StaticRNN() with rnn.step(): @@ -667,7 +667,7 @@ def setUp(self): rnn.update_memory(h_pre, h) rnn.output(h) - self.output = layers.mean(rnn()) + self.output = paddle.mean(rnn()) def forward_two_rnn(self): self.feed_map = { diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py index 41685fa4254bf..52048d798ba02 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py @@ -133,7 +133,7 @@ def run_main(self, place, with_data_parallel): tmp = layers.unsqueeze(sum_result, axes=[0]) tmp = layers.expand(tmp, expand_times=[10, 1]) fc = layers.fc(tmp, size=256) - loss = layers.mean(sum_result) + loss = paddle.mean(sum_result) optim = fluid.optimizer.Adam(learning_rate=1e-3) optim.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_ema.py b/python/paddle/fluid/tests/unittests/test_ema.py index ae0dff4edf9e3..dd3472d31c928 100644 --- a/python/paddle/fluid/tests/unittests/test_ema.py +++ b/python/paddle/fluid/tests/unittests/test_ema.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid @@ -36,7 +37,7 @@ def setUp(self): hidden = fluid.layers.fc(input=data, size=10, param_attr=self._param_name) - cost = fluid.layers.mean(hidden) + cost = paddle.mean(hidden) self._test_program = fluid.default_main_program().clone( for_test=True) diff --git a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py index a1a4a263d936a..74d101497b8ed 100644 --- a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py @@ -59,7 +59,7 @@ def run_program(self, place, stop_gradient=False): x.stop_gradient = stop_gradient emb = fluid.embedding(x, size=[10, 32], dtype='float32') - avg_cost = fluid.layers.mean(emb, name='mean_loss') + avg_cost = paddle.mean(emb, name='mean_loss') optim = fluid.optimizer.SGD(learning_rate=0.001) optim.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_exception.py b/python/paddle/fluid/tests/unittests/test_exception.py index 6e826dacf7ca5..900dbb4b1909f 100644 --- a/python/paddle/fluid/tests/unittests/test_exception.py +++ b/python/paddle/fluid/tests/unittests/test_exception.py @@ -49,7 +49,7 @@ def 
test_exception_in_static_mode(self): y = fluid.layers.data(name='Y', shape=[-1, 1], dtype='float32') predict = fluid.layers.fc(input=x, size=1, act=None) loss = fluid.layers.square_error_cost(input=predict, label=y) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_executor_check_feed.py b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py index a35ebfbab173e..120788ac50e93 100644 --- a/python/paddle/fluid/tests/unittests/test_executor_check_feed.py +++ b/python/paddle/fluid/tests/unittests/test_executor_check_feed.py @@ -17,6 +17,7 @@ import unittest import numpy +import paddle import paddle.fluid.core as core import paddle.fluid as fluid @@ -30,7 +31,7 @@ def net(self): y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) opt = fluid.optimizer.Adam(learning_rate=lr) opt.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py b/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py index 05676c34e6def..1362065f81981 100644 --- a/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py @@ -17,6 +17,7 @@ import unittest import numpy +import paddle import paddle.fluid.core as core import paddle.fluid as fluid @@ -30,7 +31,7 @@ def net(self): y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) opt = fluid.optimizer.Adam(learning_rate=lr) opt.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py b/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py index f3fe43e315212..8519beb9615ad 100644 --- a/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py +++ b/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py @@ -61,7 +61,7 @@ def _simple_fc_net(self, in_size, label_size, class_num, hidden_sizes): hidden = fluid.layers.fc(hidden, size=hidden_size) predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() diff --git a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py index 2e48157f950f8..ab8ea8d3e4bd3 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py @@ -44,7 +44,7 @@ def conv_net(self, img, label): hidden = fluid.layers.fc(input=conv_pool_2, size=32, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss, prediction def build_program(self, main, startup, is_test): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py b/python/paddle/fluid/tests/unittests/test_fleet_api_input.py index 139ce121ad587..edcaa54a7f884 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_api_input.py 
@@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig from paddle.fluid.incubate.fleet.base.role_maker import UserDefinedRoleMaker @@ -56,7 +57,7 @@ def testInvalidInputs(self): data = fluid.layers.data(name='X', shape=[1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) adam = fluid.optimizer.Adam() adam.minimize(loss) place = fluid.CPUPlace() @@ -156,7 +157,7 @@ def testInvalidInputs(self): self.assertRaises(Exception, transpiler.minimize, loss=[]) data = fluid.layers.data(name='X', shape=[1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) self.assertRaises(Exception, transpiler.minimize, loss=loss.name, diff --git a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py index 460ef27f63c18..7f37e7b2a44ac 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_auto.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py @@ -42,7 +42,7 @@ def test_distributed_strategy_auto(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py index 5b87f215feff7..ed914d2866510 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py @@ -55,7 +55,7 @@ def test_ps_minimize(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) role = fleet.PaddleCloudRoleMaker(is_collective=False) fleet.init(role) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py index 5e6aabe308ec1..c81d96bafd332 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_3.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_3.py @@ -42,7 +42,7 @@ def test_collective_minimize(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) @@ -72,7 +72,7 @@ def test_fleet_get_applied_optimizer(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) fleet.init(is_collective=True) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py index f48b166f97035..fb78f1f1add0c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py @@ -13,6 +13,7 @@ # limitations under the License. 
import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet @@ -43,7 +44,7 @@ def _test_checkpoint(self, fs, dir_path): place=fluid.CPUPlace()) predict = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=predict, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001) dist_optimizer = fleet.distributed_optimizer(optimizer) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py index d7de5ef3d40eb..363843dd5e839 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_fp16_allreduce_meta_optimizer.py @@ -44,7 +44,7 @@ def net(self, main_prog, startup_prog, dtype='float32'): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.fp16_allreduce = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 6ca078cdde7f5..bc6a554f84d8f 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -80,7 +80,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) @@ -148,7 +148,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 @@ -228,7 +228,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) @@ -295,7 +295,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py index 2afe4af3645f2..af2a8a1465c3c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_executor.py @@ -58,7 +58,7 @@ def node_func(): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.nccl_comm_num = 2 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py index 928ea06a611d4..3062812223d64 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py @@ -79,9 +79,9 @@ def test_opt_sharding_with_pp(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add', 'softmax', 'cross_entropy2', 'reduce_mean', + 'fill_constant', 'reduce_mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', @@ -161,9 +161,9 @@ def test_opt_sharding_with_pp_with_allreduce_fuse(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add', 'softmax', 'cross_entropy2', 'reduce_mean', + 'fill_constant', 'reduce_mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', @@ -228,8 +228,8 @@ def test_opt_sharding_with_pp_amp_gclip(self): 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'softmax', - 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', - 'fill_constant', 'elementwise_mul_grad', 'mean_grad', + 'cast', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', + 'fill_constant', 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', @@ -305,10 +305,10 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self): 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'softmax', - 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', + 'cast', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', @@ -386,22 +386,22 @@ def test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self): 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 
'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'softmax', 'cast', 'cross_entropy2', - 'mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', + 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', - 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'cast', - 'elementwise_add_grad', 'cast', 'mul_grad', 'cast', 'tanh_grad', - 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', - 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', - 'elementwise_add', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream', - 'send_v2', 'cast', 'sum', 'sum', 'cast', 'sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'check_finite_and_unscale', 'cast', 'c_allreduce_max', - 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', - 'cast', 'momentum', 'cast', 'momentum', 'cast', 'momentum', - 'momentum', 'cast', 'coalesce_tensor', 'c_broadcast', 'c_broadcast', - 'coalesce_tensor', 'c_broadcast' + 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'cast', 'elementwise_add_grad', 'cast', + 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', + 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', + 'mul_grad', 'cast', 'cast', 'mul', 'elementwise_add', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'c_sync_calc_stream', 'send_v2', 'cast', 'sum', 'sum', 'cast', + 'sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_sync_comm_stream', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'c_allreduce_max', 'cast', 'update_loss_scaling', + 'momentum', 'cast', 'momentum', 'cast', 'momentum', 'cast', + 'momentum', 'momentum', 'cast', 'coalesce_tensor', 'c_broadcast', + 'c_broadcast', 'coalesce_tensor', 'c_broadcast' ]) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py index f6f3f50be0dee..1c20d2e45be03 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -48,7 +48,7 @@ def net(self, main_prog, startup_prog): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.lamb = True @@ -120,7 +120,7 @@ def test_lamb_apply_with_amp(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.amp = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index b4f0c93d09ccc..b560cdaa66ef4 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -48,7 +48,7 @@ def net(self, main_prog, startup_prog): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, 
label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.lars = True @@ -121,7 +121,7 @@ def test_lars_apply_with_amp(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.amp = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py index f39f916dbbe64..21246cb74c442 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_meta_optimizer_base.py @@ -44,7 +44,7 @@ def net(main_prog, startup_prog): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) opt = MetaOptimizerBase(optimizer) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py index d9bc0c7a5f39c..279a2e21f70ef 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py @@ -57,7 +57,7 @@ def net(self): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost def test_pipeline_optimizer(self): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py index 5c086a5994f0b..c45c81c35b42b 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer_with_recompute.py @@ -52,7 +52,7 @@ def test_pipeline_optimizer(self): act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.pipeline = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py index 05c3391565ea2..3fde52958d353 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_raw_program_meta_optimizer.py @@ -41,7 +41,7 @@ def test_pipeline_optimizer(self): prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y) - avg_cost = paddle.fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.without_graph_optimization = True diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py index 7fc68ec15636a..8952f01dd6df5 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py 
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py @@ -446,7 +446,7 @@ def net(): y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') cost = paddle.fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost from paddle.distributed import fleet diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 20eace7cce3c0..68ad29880b372 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -52,14 +52,14 @@ def test_sharding_optimizer(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'momentum', - 'momentum', 'momentum' + 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' ]) def test_sharding_amp_optimizer(self): @@ -92,16 +92,16 @@ def test_sharding_amp_optimizer(self): 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', - 'mean', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'cast', - 'cast', 'cast', 'check_finite_and_unscale', 'cast', - 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', - 'momentum', 'momentum' + 'c_sync_comm_stream', 'cast', 'cast', 'cast', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', 'cast', + 'update_loss_scaling', 'momentum', 'momentum', 'momentum' ]) def test_sharding_recompute_optimizer(self): @@ -132,11 +132,12 @@ def test_sharding_recompute_optimizer(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 
'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'mul', 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'mul', 'elementwise_add', - 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'mul', - 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' @@ -177,9 +178,9 @@ def test_sharding_amp_recompute_optimizer(self): 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', - 'softmax', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', - 'fill_constant', 'elementwise_mul_grad', 'mean_grad', - 'cross_entropy_grad2', 'cast', 'softmax_grad', + 'softmax', 'cast', 'cross_entropy2', 'reduce_mean', + 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'mul', @@ -227,16 +228,17 @@ def test_sharding_amp_asp_optimizer(self): 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', - 'mean', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'cast', - 'cast', 'cast', 'check_finite_and_unscale', 'cast', - 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', - 'momentum', 'momentum', 'elementwise_mul' + 'c_sync_comm_stream', 'cast', 'cast', 'cast', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', 'cast', + 'update_loss_scaling', 'momentum', 'momentum', 'momentum', + 'elementwise_mul' ]) def test_sharding_weight_decay(self): @@ -268,15 +270,15 @@ def test_sharding_weight_decay(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 
'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'scale', - 'sum', 'scale', 'sum', 'scale', 'sum', 'momentum', 'momentum', - 'momentum' + 'c_sync_comm_stream', 'scale', 'sum', 'scale', 'sum', 'scale', + 'sum', 'momentum', 'momentum', 'momentum' ]) def test_sharding_gradient_clip(self): @@ -308,17 +310,18 @@ def test_sharding_gradient_clip(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'sum', - 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', - 'elementwise_div', 'elementwise_mul', 'elementwise_mul', - 'elementwise_mul', 'momentum', 'momentum', 'momentum' + 'c_sync_comm_stream', 'squared_l2_norm', 'squared_l2_norm', + 'squared_l2_norm', 'sum', 'c_allreduce_sum', 'sqrt', + 'fill_constant', 'elementwise_max', 'elementwise_div', + 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'momentum', + 'momentum', 'momentum' ]) def test_sharding_clone_for_test(self): @@ -338,7 +341,7 @@ def test_sharding_clone_for_test(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean' + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'reduce_mean' ]) @@ -464,15 +467,16 @@ def test_sharding_hybrid_dp(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 
'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' + 'c_sync_comm_stream', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_sync_comm_stream', 'momentum', 'momentum', + 'momentum' ]) def test_sharding_hybrid_dp_gm(self): @@ -527,15 +531,16 @@ def test_sharding_hybrid_dp_gm(self): 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', - 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'fill_constant', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'elementwise_add', 'elementwise_add', 'elementwise_add', - 'increment', 'elementwise_mod', 'equal', 'conditional_block' + 'c_sync_comm_stream', 'elementwise_add', 'elementwise_add', + 'elementwise_add', 'increment', 'elementwise_mod', 'equal', + 'conditional_block' ]) self.assertEqual(opt_ops, [ 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', 'scale', @@ -597,10 +602,11 @@ def test_sharding_with_pp(self): 'c_broadcast', 'c_sync_comm_stream', 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'softmax', - 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad', - 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', - 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', - 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'cross_entropy2', 'reduce_mean', 'fill_constant', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', @@ -720,26 +726,26 @@ def test_hybrid_with_mp_pp_amp_gclip(self): 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', - 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', 'mean', - 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 
'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream', - 'partial_send', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'c_sync_comm_stream', 'check_finite_and_unscale', - 'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast', - 'update_loss_scaling', 'fill_constant', 'c_allreduce_sum', - 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', - 'elementwise_div', 'elementwise_mul', 'elementwise_mul', + 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'c_sync_calc_stream', 'partial_send', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'c_sync_comm_stream', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'fill_constant', + 'c_allreduce_sum', 'c_allreduce_sum', 'sqrt', 'fill_constant', + 'elementwise_max', 'elementwise_div', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', - 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'momentum', - 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', - 'momentum', 'momentum' + 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', + 'elementwise_mul', 'momentum', 'momentum', 'momentum', 'momentum', + 'momentum', 'momentum', 'momentum', 'momentum' ]) # pp + mp, partial send recv @@ -839,25 +845,26 @@ def test_hybrid_with_mp_pp_amp_gclip_for_optimizer(self): 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', - 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', 'mean', - 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', - 'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream', - 'partial_send', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', - 'cast', 'sum', 'c_sync_comm_stream', 'check_finite_and_unscale', - 'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast', - 'update_loss_scaling', 'memcpy', 'fill_constant', 'c_allreduce_sum', - 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', - 'elementwise_div', 'elementwise_mul', 'elementwise_mul', + 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 
'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'cast', 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'c_sync_calc_stream', 'partial_send', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'cast', + 'sum', 'fill_constant', 'cast', 'sum', 'c_sync_comm_stream', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'memcpy', + 'fill_constant', 'c_allreduce_sum', 'c_allreduce_sum', 'sqrt', + 'fill_constant', 'elementwise_max', 'elementwise_div', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', - 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'adamw', - 'adamw', 'adamw', 'adamw', 'adamw', 'adamw', 'adamw', 'adamw' + 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', + 'elementwise_mul', 'elementwise_mul', 'adamw', 'adamw', 'adamw', + 'adamw', 'adamw', 'adamw', 'adamw', 'adamw' ]) # pp + mp, partial send recv @@ -948,8 +955,8 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce(self): 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', - 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', - 'fill_constant', 'elementwise_mul_grad', 'mean_grad', + 'softmax', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', + 'fill_constant', 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', @@ -1119,11 +1126,11 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', @@ -1218,11 +1225,11 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', - 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 
'mul_grad', 'tanh_grad', + 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'fill_constant', + 'elementwise_mul_grad', 'reduce_mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', @@ -1320,13 +1327,13 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse( self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', - 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', - 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', - 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', + 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', + 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', + 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', + 'fill_constant', 'elementwise_mul_grad', 'reduce_mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', - 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'sum', 'cast', 'sum', 'c_allreduce_sum', 'c_allreduce_sum', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', @@ -1415,10 +1422,10 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self): 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', - 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'softmax', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', @@ -1511,10 +1518,10 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse_and_avg_after_sum(self): 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', - 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'softmax', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', @@ -1574,11 +1581,12 @@ def test_hybrid_with_pp_dp_with_gradient_fuse_and_avg_after_sum(self): self.assertEqual(main_prog_op_types, [ 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 
'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', - 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', - 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'mean_grad', - 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', - 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', - 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add', 'softmax', 'cross_entropy2', 'reduce_mean', + 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'sum', 'c_allreduce_sum', 'c_sync_comm_stream', 'scale', 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', 'momentum', @@ -1640,10 +1648,10 @@ def test_hybrid_with_pp_dp_with_amp_no_dynamic_gradient_fuse_and_avg_after_sum( 'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', - 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', + 'softmax', 'cross_entropy2', 'reduce_mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', - 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'reduce_mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index c8106db13300f..d3a18ad28ca54 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -48,7 +48,7 @@ def build_program(self, main_program, startup_program, use_cuda, seed=1): data_layout='NHWC') prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=y) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(learning_rate=0.001) if use_cuda: sgd = fluid.contrib.mixed_precision.decorate( diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py index 59b85530f10da..08141c44395e3 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py @@ -94,7 +94,7 @@ def build_fused_program(self, act='softmax', param_attr=self.fc_param_attr) loss = fluid.layers.cross_entropy(input=prediction, label=y) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(learning_rate=0.001) sgd = fluid.contrib.mixed_precision.decorate( sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) @@ -144,7 +144,7 @@ def build_origin_program(self, act='softmax', param_attr=self.fc_param_attr) loss = fluid.layers.cross_entropy(input=prediction, label=y) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd = fluid.optimizer.SGD(learning_rate=0.001) sgd = fluid.contrib.mixed_precision.decorate( sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) diff --git 
a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index cddc05f591444..a9e9a588e857d 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -62,7 +62,7 @@ def simple_depthwise_net(use_feed): hidden = fluid.layers.relu(hidden) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py index 674c0b4d12fe4..7c1a80d229626 100644 --- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py @@ -63,7 +63,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): predict_label = fluid.layers.fc(hidden, size=CLASS_NUM, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index dfdb3c32dc232..e84a3f0329623 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -47,7 +47,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost @@ -89,7 +89,7 @@ def check_gradient_clip(self, place, dtype='float32'): predict = fluid.layers.fc(input=hidden, size=10, act='softmax') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) prog_clip = prog.clone() avg_cost_clip = prog_clip.block(0).var(avg_cost.name) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 9649e9c68eda2..d59cdc3e328e2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -82,7 +82,7 @@ def amp_guard_black_op(self): with fluid.dygraph.guard(): data = fluid.dygraph.to_variable(data) with fluid.dygraph.amp_guard(True): - out_fp32 = fluid.layers.mean(data) + out_fp32 = paddle.mean(data) self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) @@ -222,7 +222,7 @@ def run_simple_conv(inp_np, use_scaler=True): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) if use_scaler: print('use scaler') scaled_loss = scaler.scale(loss) @@ -273,7 +273,7 @@ def run_simple_conv(inp_np, use_scaler=True): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) if use_scaler: print('use scaler') scaled_loss = scaler.scale(loss) @@ -316,7 +316,7 @@ def nan_inf(self): data = fluid.dygraph.to_variable(inp_np) out 
= model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) scaled_loss = scaler.scale(loss) scaled_loss.backward() optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) @@ -1215,7 +1215,7 @@ def train_resnet(self, enable_amp=True, level='O1'): out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py index d12b002f04ef8..3b1a0436556b1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision_for_eager.py @@ -81,7 +81,7 @@ def amp_guard_black_op(self): with fluid.dygraph.guard(): data = fluid.dygraph.to_variable(data) with fluid.dygraph.amp_guard(True): - out_fp32 = fluid.layers.mean(data) + out_fp32 = paddle.mean(data) self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) @@ -221,7 +221,7 @@ def run_simple_conv(inp_np, use_scaler=True): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) if use_scaler: print('use scaler') scaled_loss = scaler.scale(loss) @@ -272,7 +272,7 @@ def run_simple_conv(inp_np, use_scaler=True): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) if use_scaler: print('use scaler') scaled_loss = scaler.scale(loss) @@ -315,7 +315,7 @@ def nan_inf(self): data = fluid.dygraph.to_variable(inp_np) out = model(data) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) scaled_loss = scaler.scale(loss) scaled_loss.backward() optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) @@ -1206,7 +1206,7 @@ def train_resnet(self, enable_amp=True, level='O1'): out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index 4dee7cf963348..7a5934b4fdc79 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -13,6 +13,7 @@ # limitations under the License. import unittest +import paddle import paddle.fluid as fluid import numpy as np from paddle.fluid.framework import _test_eager_guard @@ -79,7 +80,7 @@ def forward(self, x, label): label = fluid.layers.cast(label, dtype='int64') # Note that the label is not persistable in fluid.layers.cross_entropy. loss = fluid.layers.cross_entropy(input=feature, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -96,7 +97,7 @@ def forward(self, x, label, test_num): dim=1) # Note that: part2 is not used. 
loss = fluid.layers.cross_entropy(input=part1, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) if test_num == 1: return loss, part2 else: @@ -460,7 +461,7 @@ def func_case3_prune_no_grad_branch2(self): label = fluid.layers.cast(label, dtype="float32") label = fluid.layers.cast(label, dtype='int64') out = fluid.layers.one_hot(input=label, depth=100) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) loss.backward() self.assertTrue(linear.weight._grad_ivar() is None) @@ -472,7 +473,7 @@ def test_case3_prune_no_grad_branch2(self): def func_case4_with_no_grad_op_maker(self): with fluid.dygraph.guard(): out = fluid.layers.gaussian_random(shape=[20, 30]) - loss = fluid.layers.mean(out) + loss = paddle.mean(out) loss.backward() self.assertTrue(out._grad_ivar() is None) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py index aeead6ff74745..a365b00e9129c 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -178,7 +178,7 @@ def func_test_mnist_float32(self): helper.assertEachVar(cost, cost_static) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) dy_out = avg_loss.numpy() @@ -213,7 +213,7 @@ def func_test_mnist_float32(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) sgd.minimize(avg_loss) # initialize params and fetch them diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py index 23af23a4286ea..18094024b4a10 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py @@ -63,7 +63,7 @@ def func_test_mnist_sort_gradient_float32(self): cost2 = mnist2(img2) loss2 = fluid.layers.cross_entropy(cost2, label2) - avg_loss2 = fluid.layers.mean(loss2) + avg_loss2 = paddle.mean(loss2) dy_out2 = avg_loss2.numpy() @@ -100,7 +100,7 @@ def func_test_mnist_sort_gradient_float32(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) sgd.minimize(avg_loss) # initialize params and fetch them diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index 69ebf875b3d0b..0371176d7824f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -309,7 +309,7 @@ def func_test_resnet_float32(self): resnet.train() loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() @@ -356,7 +356,7 @@ def func_test_resnet_float32(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) optimizer.minimize(avg_loss) # initialize params and fetch them diff --git 
a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py index 0a1d1c0cfb315..4942e1db76968 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py @@ -112,7 +112,7 @@ def func_test_resnet_sort_gradient_float32(self): out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() @@ -161,7 +161,7 @@ def func_test_resnet_sort_gradient_float32(self): label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = resnet(img) loss = fluid.layers.cross_entropy(input=out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) optimizer.minimize(avg_loss) # initialize params and fetch them diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 245982c71ccc2..fa2d470fc5e79 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -351,7 +351,7 @@ def run_dygraph(): softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) dy_out = avg_loss.numpy() @@ -410,7 +410,7 @@ def run_dygraph(): out = se_resnext(img) softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) - avg_loss = fluid.layers.mean(x=loss) + avg_loss = paddle.mean(x=loss) optimizer.minimize(avg_loss) # initialize params and fetch them diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 092478bbf2ae1..b8ea449c2b254 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -427,7 +427,7 @@ def get_generator_loss(image_real, label_org, label_trg, generator, pred_fake, cls_fake = discriminator(fake_img) - g_loss_fake = -fluid.layers.mean(pred_fake) + g_loss_fake = -paddle.mean(pred_fake) g_loss_cls = loss_cls(cls_fake, label_trg, cfg) g_loss = g_loss_fake + cfg.lambda_rec * g_loss_rec + g_loss_cls return g_loss @@ -439,8 +439,8 @@ def get_discriminator_loss(image_real, label_org, label_trg, generator, pred_real, cls_real = discriminator(image_real) pred_fake, _ = discriminator(fake_img) d_loss_cls = loss_cls(cls_real, label_org, cfg) - d_loss_fake = fluid.layers.mean(pred_fake) - d_loss_real = -fluid.layers.mean(pred_real) + d_loss_fake = paddle.mean(pred_fake) + d_loss_real = -paddle.mean(pred_real) d_loss = d_loss_real + d_loss_fake + d_loss_cls d_loss_gp = gradient_penalty(discriminator, image_real, fake_img, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py index 619e1ba37d60c..d031cd84683da 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_mnist.py @@ -52,7 +52,7 @@ def 
static_train_net(img, label): prediction = convolutional_neural_network(img) loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) @@ -159,7 +159,7 @@ def load_and_train_dygraph(self): cost = mnist(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() sgd.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py index 6c90b8348714c..0c4dad64adaea 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_static_runner_while.py @@ -86,7 +86,7 @@ def train_and_save_model(self): pred = while_softmax_regression(img) loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) @@ -144,7 +144,7 @@ def load_and_train_dygraph(self): cost = while_net(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() sgd.minimize(avg_loss) @@ -169,7 +169,7 @@ def load_and_train_static(self): pred = while_softmax_regression(img) loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index 89b7771700f57..431b8be2a779e 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -61,7 +61,7 @@ def test_fit_line_inference_model(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) @@ -142,7 +142,7 @@ def test_save_inference_model(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) place = core.CPUPlace() exe = executor.Executor(place) @@ -166,7 +166,7 @@ def test_save_inference_model_with_auc(self): auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict, label=y) cost = fluid.layers.cross_entropy(input=predict, label=y) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) place = core.CPUPlace() exe = executor.Executor(place) @@ -197,7 +197,7 @@ def test_save_inference_model(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) place = core.CPUPlace() exe = executor.Executor(place) @@ -230,7 +230,7 @@ def test_save_and_load_inference_model(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = 
optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) @@ -350,7 +350,7 @@ def test_serialize_program_and_persistables(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) @@ -393,7 +393,7 @@ def test_normalize_program(self): y_predict = layers.fc(input=x, size=1, act=None) cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py index f45ada0a52980..6b4e3602fe641 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -17,6 +17,7 @@ import os import unittest import numpy as np +import paddle import paddle.fluid.core as core import paddle.fluid as fluid from parallel_executor_test_base import TestParallelExecutorBase, DeviceType @@ -38,7 +39,7 @@ def fc_with_batchnorm(use_feed): hidden = fluid.layers.batch_norm(input=hidden) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py index cd34e9070213a..fa3adfb9e99f5 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py @@ -65,7 +65,7 @@ def check_network_convergence(self, prob = ie() loss = layers.cross_entropy(input=prob[0], label=label) - avg_loss = layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) optimizer.minimize(avg_loss, startup_prog) diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py index 360457000befd..ed4dd80885d31 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py @@ -15,7 +15,7 @@ # nlp model stack of op operate on lod. It's a classical test case in optimize pass. 
from __future__ import print_function - +import paddle import paddle.fluid as fluid import unittest from ir_memory_optimize_net_base import TestIrMemOptBase @@ -43,7 +43,7 @@ def lstm_net(data, fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py index 4b775197aaea1..ac57e1b92243f 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py @@ -36,7 +36,7 @@ def simple_fc_net(use_feed): x = fluid.layers.fc(input=x, size=20, act='relu') y_predict = fluid.layers.fc(input=x, size=10, act='softmax') cost = fluid.layers.cross_entropy(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost @@ -49,7 +49,7 @@ def fc_with_inplace_net(use_feed): reshape = fluid.layers.reshape(x=reshape, shape=[-1, 5, 2]) y_predict = fluid.layers.fc(input=reshape, size=10, act='softmax') cost = fluid.layers.cross_entropy(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index fd4129f47ff65..eab86141ba6b1 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -96,7 +96,7 @@ def __init__(self, in_size, out_size): def forward(self, x, label): out = self._linear(x) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return out, avg_loss @@ -113,7 +113,7 @@ def __init__(self, in_size, out_size): def forward(self, x, label): out = self._linear(x) loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return out @@ -142,7 +142,7 @@ def __init__(self, in_size, out_size): def forward(self, x): y = self._linear(x) z = self._linear(y) - loss = fluid.layers.mean(z) + loss = paddle.mean(z) return z, loss @@ -160,7 +160,7 @@ def __init__(self, in_size, out_size): def forward(self, x, y): x_out = self._linear1(x) y_out = self._linear2(y) - loss = fluid.layers.mean(x_out + y_out) + loss = paddle.mean(x_out + y_out) return x_out, y_out, loss @@ -176,7 +176,7 @@ def __init__(self, in_size, out_size): def forward(self, x, y): x_out = self._linear1(x) y_out = self._linear2(y) - loss = fluid.layers.mean(x_out + y_out) + loss = paddle.mean(x_out + y_out) return x_out, y_out, loss @@ -208,7 +208,7 @@ def __init__(self, in_size, out_size): def forward(self, x): y = self._linear_1(x) z = self._linear_2(y) - loss = fluid.layers.mean(z) + loss = paddle.mean(z) return y, loss @@ -224,7 +224,7 @@ def forward(self, x): y = self._linear_1(x) z = self._linear_2(y) out = y + z - loss = fluid.layers.mean(out) + loss = paddle.mean(out) return y, [(z, loss), out] @@ -316,7 +316,7 @@ def train(layer, input_size=784, label_size=1): cost = layer(img) loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) avg_loss.backward() sgd.minimize(avg_loss) diff --git 
a/python/paddle/fluid/tests/unittests/test_lambv2_op.py b/python/paddle/fluid/tests/unittests/test_lambv2_op.py index cde23216c1093..6ae2dbfb590bd 100644 --- a/python/paddle/fluid/tests/unittests/test_lambv2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lambv2_op.py @@ -125,7 +125,7 @@ def _build_static_model(main, startup, seed=100): y = fluid.layers.data(name='Y', shape=[1], dtype='float32') prediction = fluid.layers.fc(input=x, size=1, act=None) loss = fluid.layers.square_error_cost(input=prediction, label=y) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss place = fluid.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index aead014e7abb1..551ba3ffb542b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2970,7 +2970,7 @@ def make_fit_a_line(self): y_predict = layers.fc(input=x, size=1, act=None) y = self._get_data(name='y', shape=[1], dtype='float32') cost = layers.square_error_cost(input=y_predict, label=y) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) return (avg_cost) def make_recognize_digits_mlp(self): @@ -2986,7 +2986,7 @@ def make_recognize_digits_mlp(self): act='softmax', param_attr=["sftmax.w1", "sftmax.w2"]) cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) return (avg_cost) def make_conv2d_transpose(self): @@ -3019,7 +3019,7 @@ def make_recognize_digits_conv(self): predict = layers.fc(input=conv_pool_2, size=10, act="softmax") cost = layers.cross_entropy(input=predict, label=label) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) return avg_cost def make_word_embedding(self): @@ -3062,7 +3062,7 @@ def make_word_embedding(self): size=dict_size, act='softmax') cost = layers.cross_entropy(input=predict_word, label=next_word) - avg_cost = layers.mean(cost) + avg_cost = paddle.mean(cost) return (avg_cost) def make_sigmoid_cross_entropy(self): @@ -3235,7 +3235,7 @@ def make_nce(self): num_total_classes=dict_size, param_attr='nce.w', bias_attr='nce.b') - avg_loss = layers.mean(loss) + avg_loss = paddle.mean(loss) return (avg_loss) def make_multiplex(self): diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 0d328034ab7ea..15e3d806bb5f0 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -39,7 +39,7 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id): # loss function cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) @@ -73,7 +73,7 @@ def run_pserver_with_empty_block(use_cuda, sync_mode, ip, port, trainers, # loss function cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py index 32029e561d0ba..0005ccb4ab6a6 100644 --- a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py +++ 
b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -48,7 +48,7 @@ def static_train_net(img, label): prediction = convolutional_neural_network(img) loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) optimizer = fluid.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py index 2911e7a6b71af..9843410bf7679 100644 --- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py +++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid.core as core import numpy import paddle.fluid.layers as layers @@ -191,7 +192,7 @@ def test_grad(self): array = lod_tensor_to_array(x, table) result = array_to_lod_tensor(array, table) - mean = layers.mean(result) + mean = paddle.mean(result) append_backward(mean) diff --git a/python/paddle/fluid/tests/unittests/test_lookahead.py b/python/paddle/fluid/tests/unittests/test_lookahead.py index efbc28cfa6cea..cc6977bb28420 100644 --- a/python/paddle/fluid/tests/unittests/test_lookahead.py +++ b/python/paddle/fluid/tests/unittests/test_lookahead.py @@ -42,7 +42,7 @@ def test_lookahead_static(self): with fluid.unique_name.guard(): data = fluid.data(name='X', shape=[None, 1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer = paddle.optimizer.SGD(learning_rate=SGD_LR) lookahead = paddle.incubate.optimizer.LookAhead( diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index eed0530e76113..5da93ebb798ee 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -222,7 +222,7 @@ def get_w_grad(self, is_sparse): y = fluid.layers.reduce_sum(emb, dim=-1) loss = fluid.layers.square_error_cost(input=y, label=y_) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-4) sgd_optimizer.minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index af15f271b4a70..6b7a47febb835 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -42,7 +42,7 @@ class TestMeanOp(OpTest): def setUp(self): self.op_type = "mean" - self.python_api = fluid.layers.mean + self.python_api = paddle.mean self.dtype = np.float64 self.init_dtype_type() self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)} @@ -64,12 +64,12 @@ def test_errors(self): with program_guard(Program(), Program()): # The input type of mean_op must be Variable. input1 = 12 - self.assertRaises(TypeError, fluid.layers.mean, input1) + self.assertRaises(TypeError, paddle.mean, input1) # The input dtype of mean_op must be float16, float32, float64. 
input2 = fluid.layers.data(name='input2', shape=[12, 10], dtype="int32") - self.assertRaises(TypeError, fluid.layers.mean, input2) + self.assertRaises(TypeError, paddle.mean, input2) input3 = fluid.layers.data(name='input3', shape=[4], dtype="float16") @@ -96,7 +96,7 @@ def test_checkout_grad(self): x_np = np.random.random((10, 10)).astype(self.dtype) x = paddle.to_tensor(x_np) x.stop_gradient = False - y = fluid.layers.mean(x) + y = paddle.mean(x) dx = paddle.grad(y, x)[0].numpy() dx_expected = self.dtype(1.0 / np.prod(x_np.shape)) * np.ones( x_np.shape).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_memory_usage.py b/python/paddle/fluid/tests/unittests/test_memory_usage.py index adc3cd0a8442e..bdce84cfcb658 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_usage.py +++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py @@ -29,7 +29,7 @@ def train_simulator(test_batch_size=10): y = fluid.layers.data(name='y', shape=[1], dtype='float32') cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py index 650b6a9a247d5..4b2a849b8b035 100644 --- a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py +++ b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py @@ -33,7 +33,7 @@ def loss_net(hidden, label): prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/test_modelaverage.py b/python/paddle/fluid/tests/unittests/test_modelaverage.py index 7bb1e7d2e7a27..73a8bf0247c55 100644 --- a/python/paddle/fluid/tests/unittests/test_modelaverage.py +++ b/python/paddle/fluid/tests/unittests/test_modelaverage.py @@ -38,7 +38,7 @@ def test_model_average_static(self): with fluid.unique_name.guard(): data = fluid.data(name='X', shape=[None, 1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) test_program = train_program.clone() optimizer = paddle.optimizer.Momentum(learning_rate=0.2, momentum=0.1) diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 0b6bd99e6592f..949bb2fb3250d 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -517,7 +517,7 @@ def test_momentum(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9) @@ -658,7 +658,7 @@ def test_momentum_static(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( learning_rate=0.1, 
momentum=0.9) @@ -987,7 +987,7 @@ def _momentum_optimize_static(self, name='X', dtype='float32') hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer.minimize(loss) exe.run(startup_program) if use_amp: diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index bbeec5ce62111..7418714b23b8d 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -16,7 +16,7 @@ import numpy as np import unittest - +import paddle import paddle.fluid as fluid import paddle.fluid.initializer as initializer from paddle.fluid import Program, program_guard @@ -192,7 +192,7 @@ def train_network(self, num_total_classes, num_neg_samples, sampler, seed=1, num_neg_samples=num_neg_samples, is_sparse=is_sparse) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) # optimizer optimizer = self.get_optimizer() optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py index 7f230164d6027..64f9f14f94ed0 100644 --- a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py +++ b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py @@ -39,7 +39,7 @@ def run_net_on_place(self, place): y = fluid.layers.data(name='y', shape=[1], dtype=self.dtype) y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py index 170b916941d36..c43fcc51a816f 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_nn_sigmoid_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid.core as core from op_test import OpTest from scipy.special import expit, erf diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py index 30cfa9f17ebcc..65d09ebff51d1 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_grad.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_grad.py @@ -122,7 +122,7 @@ def cond_false(): cond_i = fluid.layers.assign(np.array([cond_i], dtype='float32')) sum_cond = fluid.layers.cond(cond_i > 1.0, cond_true, cond_false) sum_all = fluid.layers.sum([sum_xy, sub_yz, sum_cond]) - mean_out = fluid.layers.mean(sum_all) + mean_out = paddle.mean(sum_all) if use_bf16: import paddle.static.amp as amp self.optimizer = amp.bf16.decorate_bf16( diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py index 40afe9248bf9b..4331ea8ff3136 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py @@ -72,7 +72,7 @@ def double_fc_net(image): def fn_1(opt, avg_loss=None, pred=None, label=None): if avg_loss is None: loss = layers.cross_entropy(input=pred, label=label) - avg_loss = layers.mean(loss, name='mean_cross_entropy_loss') + avg_loss = paddle.mean(loss, name='mean_cross_entropy_loss') 
opt.minimize(avg_loss) return avg_loss @@ -80,7 +80,7 @@ def fn_2(opt, avg_loss=None, pred=None, label=None): if avg_loss is None: loss = layers.softmax_with_cross_entropy(logits=pred, label=label) - avg_loss = layers.mean(loss, name='mean_softmax_loss') + avg_loss = paddle.mean(loss, name='mean_softmax_loss') opt.minimize(avg_loss) return avg_loss @@ -101,10 +101,10 @@ def fn_2(opt, avg_loss=None, pred=None, label=None): lambda: fn_2(sgd, None, prediction, label)) else: loss_1 = layers.cross_entropy(input=prediction, label=label) - avg_loss_1 = layers.mean(loss_1) + avg_loss_1 = paddle.mean(loss_1) loss_2 = layers.softmax_with_cross_entropy(logits=prediction, label=label) - avg_loss_2 = layers.mean(loss_2) + avg_loss_2 = paddle.mean(loss_2) avg_loss = layers.case([(mod_two, lambda: fn_1(adam, avg_loss_1))], lambda: fn_2(sgd, avg_loss_2)) @@ -174,13 +174,13 @@ def dynamic(train_data, use_cuda=False, use_parallel_exe=False): if epoch % 2 == 0: cross_entropy_loss = layers.cross_entropy(prediction, var_label) - loss = layers.mean(cross_entropy_loss) + loss = paddle.mean(cross_entropy_loss) loss.backward() adam.minimize(loss) else: softmax_loss = layers.softmax_with_cross_entropy( prediction, var_label) - loss = layers.mean(softmax_loss) + loss = paddle.mean(softmax_loss) loss.backward() sgd.minimize(loss) @@ -247,7 +247,7 @@ def fn_2(opt, avg_loss): x = fluid.layers.data("X", [10], 'float32') hidden = layers.fc(x, 5) - avg_loss = layers.mean(hidden) + avg_loss = paddle.mean(hidden) adam = optimizer.Adam(learning_rate=LR) sgd = optimizer.SGD(learning_rate=LR) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index c81a38019956f..83017f49e505c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -160,7 +160,7 @@ def check_network_convergence(self, input=feature_out, label=target, param_attr=fluid.ParamAttr(name='crfw', learning_rate=1e-1)) - avg_cost = fluid.layers.mean(crf_cost) + avg_cost = paddle.mean(crf_cost) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.layers.exponential_decay( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py index 7618371036b12..2716a38d89399 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import numpy import os @@ -33,7 +34,7 @@ def check_drop_scope(self, use_cuda=True): with fluid.program_guard(train_program, startup_program): data = fluid.layers.data(name='X', shape=[1], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) test_program = fluid.default_main_program().clone(for_test=True) fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index aefa635508db0..9cac242a7ba7b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -12,6 +12,7 @@ # See the License for the specific language 
governing permissions and # limitations under the License. +import paddle import paddle.fluid as fluid from paddle.fluid import compiler import unittest @@ -74,7 +75,7 @@ def network_func(): hidden = fluid.layers.fc(input=img, size=200, act='tanh') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) + avg_loss = paddle.mean(loss) fluid.optimizer.Adam().minimize(avg_loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 0c3c293f7b9c3..7321327372d87 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -15,6 +15,7 @@ from __future__ import print_function import math +import paddle import paddle.fluid as fluid from paddle.fluid import compiler import paddle.fluid.core as core @@ -59,7 +60,7 @@ def parallel_exe(self, label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = Lenet(data, class_dim=102) loss = fluid.layers.cross_entropy(input=out, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) opt = fluid.optimizer.Momentum( learning_rate=0.1, momentum=0.9, diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 2e2791351bfec..f2a753a9874ed 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -38,7 +38,7 @@ def simple_fc_net(use_feed): value=1.0))) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss @@ -61,7 +61,7 @@ def fc_with_batchnorm(use_feed): prediction = fluid.layers.fc(hidden, size=10, act='softmax') with fluid.name_scope("loss"): loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 4f5cfba0c1ab3..e6b334600bc82 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -59,7 +59,7 @@ def build_program(self, compile_program=True): predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') label = fluid.layers.data(name='y', shape=[1], dtype='int64') cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) batch_size = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=predict, label=label, diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py index b768aa7305158..cd00af1ed96da 100644 --- a/python/paddle/fluid/tests/unittests/test_program.py +++ b/python/paddle/fluid/tests/unittests/test_program.py @@ -16,6 +16,7 @@ import unittest from paddle.fluid.framework import Program, default_main_program, program_guard, grad_var_name +import paddle import paddle.fluid.layers as layers import paddle.fluid as fluid @@ -120,7 +121,7 @@ def net(): use_double_buffer=True) in_data, label = 
fluid.layers.read_file(reader) predict_label = fluid.layers.fc(in_data, size=2, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() @@ -146,7 +147,7 @@ def test_program_all_parameters(self): program = fluid.default_main_program() data = fluid.data(name='x', shape=[None, 13], dtype='float32') hidden = fluid.layers.fc(input=data, size=10) - loss = fluid.layers.mean(hidden) + loss = paddle.mean(hidden) fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) # NOTE: here the parameters are fc_0.w_0 and fc_0.b_0 @@ -182,7 +183,7 @@ def net(): use_double_buffer=True) in_data, label = fluid.layers.read_file(reader) predict_label = fluid.layers.fc(in_data, size=2, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index c602cfb4ad0b3..0f4543bc934a4 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -52,7 +52,7 @@ def lstm_net(use_feed): fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost @@ -70,7 +70,7 @@ def simple_fc_net_with_accuracy(use_feed): value=1.0))) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) accuracy_out = fluid.layers.accuracy(input=prediction, label=label, k=5) return loss @@ -83,12 +83,12 @@ def cond_net(use_feed=None): def loss1(pred, label): x = fluid.layers.data(name="x", shape=[4], dtype='float32') loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss, name='mean_cross_entropy_loss') + avg_loss = paddle.mean(loss, name='mean_cross_entropy_loss') return avg_loss def loss2(pred, label): loss = fluid.layers.softmax_with_cross_entropy(logits=pred, label=label) - avg_loss = fluid.layers.mean(loss, name='mean_softmax_loss') + avg_loss = paddle.mean(loss, name='mean_softmax_loss') return avg_loss two = fluid.layers.fill_constant([1], 'int32', 2) @@ -106,14 +106,14 @@ def optimization_in_cond_net(with_optimize=False): def loss1(opt, pred, label, with_optimize): x = fluid.layers.data(name="x", shape=[4], dtype='float32') loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(loss, name='mean_cross_entropy_loss') + avg_loss = paddle.mean(loss, name='mean_cross_entropy_loss') if with_optimize: opt.minimize(avg_loss) return avg_loss def loss2(opt, pred, label, with_optimize): loss = fluid.layers.softmax_with_cross_entropy(logits=pred, label=label) - avg_loss = fluid.layers.mean(loss, name='mean_softmax_loss') + avg_loss = paddle.mean(loss, name='mean_softmax_loss') if with_optimize: opt.minimize(avg_loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py index 71b07155f4015..c9a7317bfff3b 100644 --- a/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_prroi_pool_op.py @@ -18,6 +18,7 @@ import unittest from py_precise_roi_pool import PyPrRoIPool from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid import compiler, Program, program_guard @@ -103,7 +104,7 @@ def run_net(self, place): dtype="float32", lod_level=1) output = fluid.layers.prroi_pool(x, rois, 0.25, 2, 2) - loss = fluid.layers.mean(output) + loss = paddle.mean(output) optimizer = fluid.optimizer.SGD(learning_rate=1e-3) optimizer.minimize(loss) input_x = fluid.create_lod_tensor(self.x, [], place) @@ -234,7 +235,7 @@ def run_net(self, place): 2, 2, batch_roi_nums=rois_index) - loss = fluid.layers.mean(output) + loss = paddle.mean(output) optimizer = fluid.optimizer.SGD(learning_rate=1e-3) optimizer.minimize(loss) exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_prune.py b/python/paddle/fluid/tests/unittests/test_prune.py index c320e3fbf58b2..730a6c1b8a8ff 100644 --- a/python/paddle/fluid/tests/unittests/test_prune.py +++ b/python/paddle/fluid/tests/unittests/test_prune.py @@ -16,6 +16,7 @@ import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework import paddle.compat as cpt @@ -31,7 +32,7 @@ def net(self): label = fluid.layers.data(name="label", shape=[1], dtype="int64") y = fluid.layers.fc(input=[x], size=2, act="softmax") loss = fluid.layers.cross_entropy(input=y, label=label) - loss = fluid.layers.mean(x=loss) + loss = paddle.mean(x=loss) return x, y, label, loss def test_prune_with_input(self): @@ -41,14 +42,14 @@ def test_prune_with_input(self): with fluid.program_guard(program, startup_program): (x, y, label, loss) = self.net() self.assertEqual(len(block.ops), 5) - self.assertEqual( - [op.type for op in block.ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in block.ops], [ + "mul", "elementwise_add", "softmax", "cross_entropy2", "reduce_mean" + ]) pruned_program = program._prune_with_input( feeded_var_names=[y.name, label.name], targets=[loss]) self.assertEqual(len(pruned_program.global_block().ops), 2) self.assertEqual([op.type for op in pruned_program.global_block().ops], - ["cross_entropy2", "mean"]) + ["cross_entropy2", "reduce_mean"]) def test_prune(self): program = framework.Program() @@ -57,14 +58,16 @@ def test_prune(self): with fluid.program_guard(program, startup_program): (x, y, label, loss) = self.net() self.assertEqual(len(block.ops), 5) - self.assertEqual( - [op.type for op in block.ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in block.ops], [ + "mul", "elementwise_add", "softmax", "cross_entropy2", "reduce_mean" + ]) pruned_program = program._prune(targets=[loss]) self.assertEqual(len(pruned_program.global_block().ops), 5) - self.assertEqual( - [op.type for op in pruned_program.global_block().ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in pruned_program.global_block().ops], + [ + "mul", "elementwise_add", "softmax", + "cross_entropy2", "reduce_mean" + ]) def test_prune_target_not_list(self): program = framework.Program() @@ -73,14 +76,16 @@ def test_prune_target_not_list(self): with fluid.program_guard(program, startup_program): (x, y, label, loss) = self.net() self.assertEqual(len(block.ops), 5) - self.assertEqual( - [op.type for op in block.ops], - ["mul", "elementwise_add", "softmax", 
"cross_entropy2", "mean"]) + self.assertEqual([op.type for op in block.ops], [ + "mul", "elementwise_add", "softmax", "cross_entropy2", "reduce_mean" + ]) pruned_program = program._prune(targets=loss) self.assertEqual(len(pruned_program.global_block().ops), 5) - self.assertEqual( - [op.type for op in pruned_program.global_block().ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in pruned_program.global_block().ops], + [ + "mul", "elementwise_add", "softmax", + "cross_entropy2", "reduce_mean" + ]) def test_prune_target_none(self): program = framework.Program() @@ -89,9 +94,9 @@ def test_prune_target_none(self): with fluid.program_guard(program, startup_program): (x, y, label, loss) = self.net() self.assertEqual(len(block.ops), 5) - self.assertEqual( - [op.type for op in block.ops], - ["mul", "elementwise_add", "softmax", "cross_entropy2", "mean"]) + self.assertEqual([op.type for op in block.ops], [ + "mul", "elementwise_add", "softmax", "cross_entropy2", "reduce_mean" + ]) try: pruned_program = program._prune(targets=None) except ValueError as e: @@ -128,9 +133,9 @@ def net1(self): act="softmax", param_attr=w_param_attrs) loss1 = fluid.layers.cross_entropy(input=y, label=label) - loss1 = fluid.layers.mean(x=loss1) + loss1 = paddle.mean(x=loss1) loss2 = fluid.layers.cross_entropy(input=y, label=label) - loss2 = fluid.layers.mean(x=loss2) + loss2 = paddle.mean(x=loss2) loss1.persistable = True loss2.persistable = True return x, y, label, loss1, loss2, w_param_attrs @@ -158,9 +163,9 @@ def net2(self): act="softmax", param_attr=w2_param_attrs) loss1 = fluid.layers.cross_entropy(input=y1, label=label) - loss1 = fluid.layers.mean(x=loss1) + loss1 = paddle.mean(x=loss1) loss2 = fluid.layers.cross_entropy(input=y2, label=label) - loss2 = fluid.layers.mean(x=loss2) + loss2 = paddle.mean(x=loss2) return x1, x2, y1, y2, label, loss1, loss2, w1_param_attrs, w2_param_attrs def test_not_prune(self): diff --git a/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py b/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py index b15edb44d57a8..d700966126a2e 100644 --- a/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py +++ b/python/paddle/fluid/tests/unittests/test_pull_gpups_sparse_op.py @@ -43,7 +43,7 @@ def test_static_graph(self): size=[11], is_distributed=True, is_sparse=True) - cost = paddle.fluid.layers.mean(output) + cost = paddle.mean(output) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(cost, train_program) block = train_program.global_block() diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index f0f791d62a7a2..0eaf4b453bdaa 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -127,7 +127,7 @@ def simple_fc_net(img, label, use_py_func_op): assert loss == loss_out and dummy_var == dummy_var_out, \ "py_func failed with multi input and output" - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index 4be5a4ae94860..830ade004d3a6 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -119,7 +119,7 @@ def simple_fc_net(in_size, value=1.0))) 
predict_label = fluid.layers.fc(hidden, size=class_num, act='softmax') - loss = fluid.layers.mean( + loss = paddle.mean( fluid.layers.cross_entropy(input=predict_label, label=label)) optimizer = fluid.optimizer.Adam() diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index 568d57c09355f..0acd0ac398e34 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import numpy as np @@ -134,7 +135,7 @@ def setUp(self): self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -262,7 +263,7 @@ def setUp(self): self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -364,7 +365,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -446,7 +447,7 @@ def setUp(self): self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -553,7 +554,7 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): rnn_out = self.create_rnn_op() - self.output = layers.mean(rnn_out) + self.output = paddle.mean(rnn_out) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], @@ -637,7 +638,7 @@ def setUp(self): self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape) with fluid.program_guard(self.main_program, self.startup_program): - self.output = layers.mean(self.create_rnn_op()) + self.output = paddle.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim], diff --git a/python/paddle/fluid/tests/unittests/test_registry.py b/python/paddle/fluid/tests/unittests/test_registry.py index e9f847185fc76..8d803635aa35c 100644 --- a/python/paddle/fluid/tests/unittests/test_registry.py +++ b/python/paddle/fluid/tests/unittests/test_registry.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid as fluid import numpy as np from decorator_helper import prog_scope @@ -25,7 +26,7 @@ class TestRegistry(unittest.TestCase): @prog_scope() def test_registry_layer(self): x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32') - output = fluid.layers.mean(x) + output = paddle.mean(x) place = fluid.CPUPlace() exe = fluid.Executor(place) diff --git 
a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 304e47da9a61a..4a48b6fb1f838 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -147,7 +147,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_regularizer_api.py b/python/paddle/fluid/tests/unittests/test_regularizer_api.py index da2643cc64726..fc46c9c93c37e 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer_api.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer_api.py @@ -50,7 +50,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index 42f32f2e75bd8..eb192fcde6fac 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -246,7 +246,7 @@ def test_rmsprop(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) rms_optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) rms_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_select_input_output_op.py b/python/paddle/fluid/tests/unittests/test_select_input_output_op.py index 8a41e05d1d52a..c809c973438eb 100644 --- a/python/paddle/fluid/tests/unittests/test_select_input_output_op.py +++ b/python/paddle/fluid/tests/unittests/test_select_input_output_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.layers as layers @@ -43,7 +44,7 @@ def test_forward_backward_list_output(self): select_output(x, outputs, mask) y = select_input(outputs, mask) - mean = layers.mean(y) + mean = paddle.mean(y) append_backward(mean) place = fluid.CUDAPlace( diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index 8e00d905a3520..8f4f5dad074f0 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -206,7 +206,7 @@ def runTest(self): out = fluid.layers.l2_normalize(x=emb, axis=-1) cost = fluid.layers.square_error_cost(input=out, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) @@ -368,7 +368,7 @@ def static_sgd_mp(self, mp): name='X', dtype='float32') hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer.minimize(loss) exe.run(startup_program) @@ -470,7 +470,7 @@ def static_sgd_mp(self, mp): name='X', dtype='float32') hidden = 
paddle.static.nn.fc(x=data, size=10) - loss = paddle.fluid.layers.mean(hidden) + loss = paddle.mean(hidden) optimizer.minimize(loss) exe.run(startup_program) diff --git a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py index daa3f191ccd72..c3cb57f9438f1 100644 --- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py +++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid.core as core from paddle.fluid.executor import Executor import paddle.fluid.layers as layers @@ -47,7 +48,7 @@ def setUp(self): i = layers.increment(x=i) i.stop_gradient = True self.mem3 = shrink_memory(x=self.mem2, i=i, table=table) - mem3_mean = layers.mean(self.mem3) + mem3_mean = paddle.mean(self.mem3) append_backward(loss=mem3_mean) self.x_grad = self.main_program.global_block().var('x@GRAD') diff --git a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py index 8f2380845875a..e027401549a01 100644 --- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py @@ -16,6 +16,7 @@ import unittest from paddle.fluid import Program, program_guard +import paddle import paddle.fluid.core as core import numpy as np import paddle.fluid.layers as layers @@ -195,7 +196,7 @@ def test_grad(self): mask=y, x=x, level=level) - mean = layers.mean(out) + mean = paddle.mean(out) append_backward(mean) diff --git a/python/paddle/fluid/tests/unittests/test_trainable.py b/python/paddle/fluid/tests/unittests/test_trainable.py index 72edff9f29b34..546ab11466db7 100644 --- a/python/paddle/fluid/tests/unittests/test_trainable.py +++ b/python/paddle/fluid/tests/unittests/test_trainable.py @@ -16,6 +16,7 @@ from collections import Counter import unittest +import paddle import paddle.fluid as fluid from simple_nets import init_data @@ -27,7 +28,7 @@ def test_trainable(): size=10, param_attr=fluid.ParamAttr(trainable=False)) loss = fluid.layers.cross_entropy(input=feature, label=label) - loss = fluid.layers.mean(loss) + loss = paddle.mean(loss) return loss diff --git a/python/paddle/fluid/tests/unittests/test_weight_decay.py b/python/paddle/fluid/tests/unittests/test_weight_decay.py index b42bfb1a684ac..7cdfc3da93048 100644 --- a/python/paddle/fluid/tests/unittests/test_weight_decay.py +++ b/python/paddle/fluid/tests/unittests/test_weight_decay.py @@ -63,7 +63,7 @@ def bow_net(data, fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) + avg_cost = paddle.mean(x=cost) return avg_cost diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py index 51cb380be8438..967f917fd93b7 100644 --- a/python/paddle/fluid/tests/unittests/test_where_op.py +++ b/python/paddle/fluid/tests/unittests/test_where_op.py @@ -97,7 +97,7 @@ def test_api(self, use_cuda=False): x.stop_gradient = x_stop_gradient y.stop_gradient = y_stop_gradient result = paddle.where(cond, x, y) - append_backward(layers.mean(result)) + append_backward(paddle.mean(result)) for use_cuda in [False, True]: if (use_cuda and (not 
fluid.core.is_compiled_with_cuda())): diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py index baf111df6335a..92d67406b033a 100644 --- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py @@ -222,7 +222,7 @@ def body(i, x): x.stop_gradient = False out = layers.while_loop(cond, body, [i, x]) - mean = layers.mean(out[1]) + mean = paddle.mean(out[1]) append_backward(mean) place = fluid.CUDAPlace( @@ -264,7 +264,7 @@ def body(i, x): x.stop_gradient = False out = layers.while_loop(cond, body, [i, x]) - mean = layers.mean(out[1]) + mean = paddle.mean(out[1]) append_backward(mean) place = fluid.CUDAPlace( @@ -351,7 +351,7 @@ def internal_body(j, x, mem_array): [i, j, x, mem_array]) sum_result = layers.array_read(array=mem_array, i=j) - mean = layers.mean(sum_result) + mean = paddle.mean(sum_result) append_backward(mean) place = fluid.CUDAPlace( diff --git a/python/paddle/fluid/tests/unittests/test_while_op.py b/python/paddle/fluid/tests/unittests/test_while_op.py index dee83692bd324..8e35a57f2426f 100644 --- a/python/paddle/fluid/tests/unittests/test_while_op.py +++ b/python/paddle/fluid/tests/unittests/test_while_op.py @@ -81,7 +81,7 @@ def simple_net(self): layers.array_write(result2, i=j, array=mem_array) layers.less_than(x=j, y=array_len2, cond=cond2) sum_result = layers.array_read(array=mem_array, i=j) - loss = layers.mean(sum_result) + loss = paddle.mean(sum_result) return loss, sum_result def test_simple_net(self): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py index b79bbafb37554..648e87f8c3174 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py @@ -58,7 +58,7 @@ def test_assign_LoDTensorArray(self): init_array = fluid.layers.array_write(x=z, i=i) array = fluid.layers.assign(init_array) sums = fluid.layers.array_read(array=init_array, i=i) - mean = fluid.layers.mean(sums) + mean = paddle.mean(sums) append_backward(mean) place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py index 0ddc38dbceba6..892a5b6840ab9 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py @@ -59,12 +59,12 @@ def test_errors(self): with program_guard(Program(), Program()): # The input type of mean_op must be Variable. input1 = 12 - self.assertRaises(TypeError, fluid.layers.mean, input1) + self.assertRaises(TypeError, paddle.mean, input1) # The input dtype of mean_op must be float16, float32, float64. 
input2 = fluid.layers.data(name='input2', shape=[12, 10], dtype="int32") - self.assertRaises(TypeError, fluid.layers.mean, input2) + self.assertRaises(TypeError, paddle.mean, input2) input3 = fluid.layers.data(name='input3', shape=[4], dtype="float16") diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py index 2e8853de44a9a..9f7ca522d742b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py @@ -249,7 +249,7 @@ def test_rmsprop(self): y = fluid.layers.data(name='y', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) print(avg_cost.shape) linear = paddle.nn.Linear(13, 5) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py index e174d24533215..8953773d8cde4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py @@ -79,7 +79,7 @@ def runTest(self): out = fluid.layers.l2_normalize(x=emb, axis=-1) cost = fluid.layers.square_error_cost(input=out, label=label) - avg_cost = fluid.layers.mean(cost) + avg_cost = paddle.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py index ad22ab86b932f..433ab36e1936b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py @@ -115,7 +115,7 @@ def test_api(self): y.stop_gradient = y_stop_gradient result = paddle.where(cond, x, y) - append_backward(fluid.layers.mean(result)) + append_backward(paddle.mean(result)) exe = fluid.Executor(self.place) exe.run(startup) From 75c975f0b447fc1cd8f9d033c4a4a4e496c69b72 Mon Sep 17 00:00:00 2001 From: Leo Guo <58431564+ZibinGuo@users.noreply.github.com> Date: Tue, 5 Jul 2022 17:32:30 +0800 Subject: [PATCH 060/250] Modify the unittests of the assign_value, iou_similarity, one_hot_v2, reduce_mean, roi_align op. 
test=kunlun (#44061) --- cmake/external/xpu.cmake | 14 +- .../unittests/xpu/test_assign_value_op_xpu.py | 135 ++++++ .../xpu/test_iou_similarity_op_xpu.py | 191 ++++---- .../unittests/xpu/test_one_hot_v2_op_xpu.py | 189 ++++---- .../unittests/xpu/test_reduce_mean_op_xpu.py | 344 +++++++-------- .../unittests/xpu/test_roi_align_op_xpu.py | 412 +++++++++--------- 6 files changed, 715 insertions(+), 570 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 7d1cca4feb6a6..d75af71203bf9 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220601") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220703") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -19,14 +19,14 @@ endif() if(NOT DEFINED XPU_XDNN_BASE_URL) set(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220601") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220703") else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() if(WITH_AARCH64) set(XPU_XRE_DIR_NAME "xre-kylin_aarch64") - set(XPU_XDNN_DIR_NAME "XDNN-kylin_aarch64") + set(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") set(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") set(XPU_XDNN_URL "${XPU_XDNN_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" @@ -40,7 +40,7 @@ elseif(WITH_SUNWAY) CACHE STRING "" FORCE) elseif(WITH_BDCENTOS) set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") - set(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") + set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # ubuntu and centos: use output by XDNN API team set(XPU_XDNN_URL @@ -48,7 +48,7 @@ elseif(WITH_BDCENTOS) CACHE STRING "" FORCE) elseif(WITH_UBUNTU) set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - set(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") + set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # ubuntu and centos: use output by XDNN API team set(XPU_XDNN_URL @@ -56,7 +56,7 @@ elseif(WITH_UBUNTU) CACHE STRING "" FORCE) elseif(WITH_CENTOS) set(XPU_XRE_DIR_NAME "xre-centos7_x86_64") - set(XPU_XDNN_DIR_NAME "XDNN-bdcentos_x86_64") + set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # ubuntu and centos: use output by XDNN API team set(XPU_XDNN_URL @@ -64,7 +64,7 @@ elseif(WITH_CENTOS) CACHE STRING "" FORCE) else() set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") - set(XPU_XDNN_DIR_NAME "XDNN-ubuntu_x86_64") + set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") # default: use output by XDNN API team set(XPU_XDNN_URL diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py new file mode 100644 index 0000000000000..6455b157cb2ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py @@ -0,0 +1,135 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy +import sys + +sys.path.append("..") +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper +import paddle + +paddle.enable_static() + + +class XPUTestAssignValueOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'assign_value' + self.use_dynamic_create_class = False + + class TestAssignValueOp(XPUOpTest): + + def init(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = 'assign_value' + + def setUp(self): + self.init() + self.inputs = {} + self.attrs = {} + self.init_data() + self.attrs["shape"] = self.value.shape + self.attrs["dtype"] = framework.convert_np_dtype_to_dtype_( + self.value.dtype) + self.outputs = {"Out": self.value} + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(self.dtype) + self.attrs["fp32_values"] = [float(v) for v in self.value.flat] + + def test_forward(self): + self.check_output_with_place(self.place) + + class TestAssignValueOp2(TestAssignValueOp): + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32) + self.attrs["int32_values"] = [int(v) for v in self.value.flat] + + class TestAssignValueOp3(TestAssignValueOp): + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64) + self.attrs["int64_values"] = [int(v) for v in self.value.flat] + + class TestAssignValueOp4(TestAssignValueOp): + + def init_data(self): + self.value = numpy.random.choice(a=[False, True], + size=(2, 5)).astype(numpy.bool) + self.attrs["bool_values"] = [int(v) for v in self.value.flat] + + +class TestAssignApi(unittest.TestCase): + + def setUp(self): + self.init_dtype() + self.value = (-100 + 200 * numpy.random.random(size=(2, 5))).astype( + self.dtype) + self.place = fluid.XPUPlace(0) + + def init_dtype(self): + self.dtype = "float32" + + def test_assign(self): + main_program = fluid.Program() + with fluid.program_guard(main_program): + x = layers.create_tensor(dtype=self.dtype) + layers.assign(input=self.value, output=x) + + exe = fluid.Executor(self.place) + [fetched_x] = exe.run(main_program, feed={}, fetch_list=[x]) + self.assertTrue(numpy.array_equal(fetched_x, self.value), + "fetch_x=%s val=%s" % (fetched_x, self.value)) + self.assertEqual(fetched_x.dtype, self.value.dtype) + + +class TestAssignApi2(TestAssignApi): + + def init_dtype(self): + self.dtype = "int32" + + +class TestAssignApi3(TestAssignApi): + + def init_dtype(self): + self.dtype = "int64" + + +class TestAssignApi4(TestAssignApi): + + def setUp(self): + self.init_dtype() + self.value = numpy.random.choice(a=[False, True], + size=(2, 5)).astype(numpy.bool) + self.place = fluid.XPUPlace(0) + + def init_dtype(self): + self.dtype = "bool" + + +support_types = get_xpu_op_support_types('assign_value') +for stype in support_types: + create_test_class(globals(), XPUTestAssignValueOp, stype) + +if __name__ == '__main__': 
+ unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py index ceb154f1e3520..56ad05505a3ac 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py @@ -22,99 +22,116 @@ import numpy as np import numpy.random as random import sys -import math -from op_test import OpTest + +sys.path.append("..") from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper import paddle paddle.enable_static() -class TestXPUIOUSimilarityOp(XPUOpTest): - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def setUp(self): - self.op_type = "iou_similarity" - self.boxes1 = random.rand(2, 4).astype('float32') - self.boxes2 = random.rand(3, 4).astype('float32') - self.output = random.rand(2, 3).astype('float32') - self.box_normalized = False - # run python iou computation - self._compute_iou() - self.inputs = {'X': self.boxes1, 'Y': self.boxes2} - self.attrs = {"box_normalized": self.box_normalized, 'use_xpu': True} - self.outputs = {'Out': self.output} - - def _compute_iou(self, ): - for row in range(self.boxes1.shape[0]): - for col in range(self.boxes2.shape[0]): - xmin1, ymin1, xmax1, ymax1 = self.boxes1[row] - xmin2, ymin2, xmax2, ymax2 = self.boxes2[col] - if not self.box_normalized: - area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1) - area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1) - else: - area1 = (ymax1 - ymin1) * (xmax1 - xmin1) - area2 = (ymax2 - ymin2) * (xmax2 - xmin2) - - inter_xmax = min(xmax1, xmax2) - inter_ymax = min(ymax1, ymax2) - inter_xmin = max(xmin1, xmin2) - inter_ymin = max(ymin1, ymin2) - inter_height = inter_ymax - inter_ymin - inter_width = inter_xmax - inter_xmin - if not self.box_normalized: - inter_height += 1 - inter_width += 1 - inter_height = max(inter_height, 0) - inter_width = max(inter_width, 0) - inter_area = inter_width * inter_height - union_area = area1 + area2 - inter_area - sim_score = inter_area / union_area - self.output[row, col] = sim_score - - -class TestXPUIOUSimilarityOpWithLoD(TestXPUIOUSimilarityOp): - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) - - def setUp(self): - super(TestXPUIOUSimilarityOpWithLoD, self).setUp() - self.boxes1_lod = [[1, 1]] - self.output_lod = [[1, 1]] - self.box_normalized = False - # run python iou computation - self._compute_iou() - self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2} - self.attrs = {"box_normalized": self.box_normalized} - self.outputs = {'Out': (self.output, self.output_lod)} - - -class TestXPUIOUSimilarityOpWithBoxNormalized(TestXPUIOUSimilarityOp): - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) - - def setUp(self): - super(TestXPUIOUSimilarityOpWithBoxNormalized, self).setUp() - self.boxes1_lod = [[1, 1]] - self.output_lod = [[1, 1]] - self.box_normalized = True - # run python iou computation - self._compute_iou() - self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2} - self.attrs = {"box_normalized": self.box_normalized} - self.outputs = {'Out': (self.output, self.output_lod)} - +class 
XPUTestIOUSimilarityOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'iou_similarity' + self.use_dynamic_create_class = False + + class TestXPUIOUSimilarityOp(XPUOpTest): + + def init(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = 'iou_similarity' + + def test_check_output(self): + self.check_output_with_place(self.place) + + def setUp(self): + self.init() + self.boxes1 = random.rand(2, 4).astype(self.dtype) + self.boxes2 = random.rand(3, 4).astype(self.dtype) + self.output = random.rand(2, 3).astype(self.dtype) + self.box_normalized = False + # run python iou computation + self._compute_iou() + self.inputs = {'X': self.boxes1, 'Y': self.boxes2} + self.attrs = { + "box_normalized": self.box_normalized, + 'use_xpu': True + } + self.outputs = {'Out': self.output} + + def _compute_iou(self, ): + for row in range(self.boxes1.shape[0]): + for col in range(self.boxes2.shape[0]): + xmin1, ymin1, xmax1, ymax1 = self.boxes1[row] + xmin2, ymin2, xmax2, ymax2 = self.boxes2[col] + if not self.box_normalized: + area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1) + area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1) + else: + area1 = (ymax1 - ymin1) * (xmax1 - xmin1) + area2 = (ymax2 - ymin2) * (xmax2 - xmin2) + + inter_xmax = min(xmax1, xmax2) + inter_ymax = min(ymax1, ymax2) + inter_xmin = max(xmin1, xmin2) + inter_ymin = max(ymin1, ymin2) + inter_height = inter_ymax - inter_ymin + inter_width = inter_xmax - inter_xmin + if not self.box_normalized: + inter_height += 1 + inter_width += 1 + inter_height = max(inter_height, 0) + inter_width = max(inter_width, 0) + inter_area = inter_width * inter_height + union_area = area1 + area2 - inter_area + sim_score = inter_area / union_area + self.output[row, col] = sim_score + + class TestXPUIOUSimilarityOpWithLoD(TestXPUIOUSimilarityOp): + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def setUp(self): + super().setUp() + self.boxes1_lod = [[1, 1]] + self.output_lod = [[1, 1]] + self.box_normalized = False + # run python iou computation + self._compute_iou() + self.inputs = { + 'X': (self.boxes1, self.boxes1_lod), + 'Y': self.boxes2 + } + self.attrs = {"box_normalized": self.box_normalized} + self.outputs = {'Out': (self.output, self.output_lod)} + + class TestXPUIOUSimilarityOpWithBoxNormalized(TestXPUIOUSimilarityOp): + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def setUp(self): + super().setUp() + self.boxes1_lod = [[1, 1]] + self.output_lod = [[1, 1]] + self.box_normalized = True + # run python iou computation + self._compute_iou() + self.inputs = { + 'X': (self.boxes1, self.boxes1_lod), + 'Y': self.boxes2 + } + self.attrs = {"box_normalized": self.box_normalized} + self.outputs = {'Out': (self.output, self.output_lod)} + + +support_types = get_xpu_op_support_types('iou_similarity') +for stype in support_types: + create_test_class(globals(), XPUTestIOUSimilarityOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py index afeccd637a265..d45e0ce34d42f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py @@ -18,136 +18,130 @@ import numpy as np import paddle import paddle.fluid.core as core +import paddle.fluid as fluid import sys sys.path.append("..") from 
op_test_xpu import XPUOpTest -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard -import time +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestOneHotOp(XPUOpTest): +class XPUTestOneHotOp(XPUOpTestWrapper): - def setUp(self): - self.use_xpu = True - self.op_type = 'one_hot_v2' - depth = 10 - depth_np = np.array(10).astype('int32') - # dimension = 12 - x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + def __init__(self): + self.op_name = 'one_hot_v2' + self.use_dynamic_create_class = False - out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + class TestOneHotOp(XPUOpTest): - for i in range(np.product(x.shape)): - out[i, x[i]] = 1.0 + def init(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = 'one_hot_v2' - self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} - self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} - self.outputs = {'Out': (out, x_lod)} + def setUp(self): + self.init() + depth = 10 + depth_np = np.array(10).astype('int32') + # dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + out = np.zeros(shape=(np.product(x.shape), + depth)).astype(self.dtype) + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 -class TestOneHotOp_attr(XPUOpTest): + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = {'Out': (out, x_lod)} - def setUp(self): - self.op_type = 'one_hot_v2' - depth = 10 - dimension = 12 - x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + def test_check_output(self): + self.check_output_with_place(self.place) - out = np.zeros(shape=(np.product(x.shape[:-1]), 1, - depth)).astype('float32') + class TestOneHotOp_attr(TestOneHotOp): - for i in range(np.product(x.shape)): - out[i, 0, x[i]] = 1.0 + def setUp(self): + self.init() + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) - self.inputs = {'X': (x, x_lod)} - self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} - self.outputs = {'Out': (out, x_lod)} + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype(self.dtype) - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + self.inputs = {'X': (x, x_lod)} + self.attrs = { + 'dtype': int(core.VarDesc.VarType.FP32), + 'depth': depth + } + self.outputs = {'Out': (out, x_lod)} -class TestOneHotOp_default_dtype(XPUOpTest): + class TestOneHotOp_default_dtype(TestOneHotOp): - def setUp(self): - self.op_type = 'one_hot_v2' - depth = 10 - depth_np = np.array(10).astype('int32') - dimension = 12 - x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + def setUp(self): + self.init() + depth = 10 + depth_np = 
np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) - out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + out = np.zeros(shape=(np.product(x.shape), + depth)).astype(self.dtype) - for i in range(np.product(x.shape)): - out[i, x[i]] = 1.0 + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 - self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} - self.attrs = {} - self.outputs = {'Out': (out, x_lod)} + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {} + self.outputs = {'Out': (out, x_lod)} - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + class TestOneHotOp_default_dtype_attr(TestOneHotOp): + def setUp(self): + self.init() + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) -class TestOneHotOp_default_dtype_attr(XPUOpTest): + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype(self.dtype) - def setUp(self): - self.op_type = 'one_hot_v2' - depth = 10 - dimension = 12 - x_lod = [[4, 1, 3, 3]] - x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 - out = np.zeros(shape=(np.product(x.shape[:-1]), 1, - depth)).astype('float32') + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} - for i in range(np.product(x.shape)): - out[i, 0, x[i]] = 1.0 + class TestOneHotOp_out_of_range(TestOneHotOp): - self.inputs = {'X': (x, x_lod)} - self.attrs = {'depth': depth} - self.outputs = {'Out': (out, x_lod)} + def setUp(self): + self.init() + depth = 10 + x_lod = [[4, 1, 3, 3]] + x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + out = np.zeros(shape=(np.product(x.shape), + depth)).astype(self.dtype) - -class TestOneHotOp_out_of_range(XPUOpTest): - - def setUp(self): - self.op_type = 'one_hot_v2' - depth = 10 - x_lod = [[4, 1, 3, 3]] - x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))] - x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) - - out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') - - self.inputs = {'X': (x, x_lod)} - self.attrs = {'depth': depth, 'allow_out_of_range': True} - self.outputs = {'Out': (out, x_lod)} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place, check_dygraph=False) + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth, 'allow_out_of_range': True} + self.outputs = {'Out': (out, x_lod)} class TestOneHotOpApi(unittest.TestCase): @@ -200,6 +194,9 @@ def test_bad_x(): self.assertRaises(TypeError, test_bad_x) +support_types = get_xpu_op_support_types('one_hot_v2') +for stype in support_types: + create_test_class(globals(), XPUTestOneHotOp, stype) + if __name__ == '__main__': - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py index 
ef483870c68ee..90fe474e09cd1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py @@ -19,196 +19,180 @@ import sys sys.path.append("..") -from op_test import OpTest, skip_check_grad_ci +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper import paddle -import paddle.fluid.core as core -import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard -from paddle.fluid.framework import convert_np_dtype_to_dtype_ +paddle.enable_static() -class TestMeanOp(OpTest): - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean(axis=0)} +class XPUTestMeanOp(XPUOpTestWrapper): - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) + def __init__(self): + self.op_name = 'reduce_mean' + self.use_dynamic_create_class = False - def check_grad_(self): - self.check_grad(['X'], 'Out') + class TestMeanOp(XPUOpTest): + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} -class TestMeanOp5D(OpTest): + def test_check_output(self): + self.check_output_with_place(self.place) - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = { - 'X': np.random.random((1, 2, 5, 6, 10)).astype("float32") - } - self.attrs = {'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean(axis=0)} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestMeanOp6D(OpTest): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = { - 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float32") - } - self.attrs = {'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean(axis=0)} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class TestMeanOp8D(OpTest): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = { - 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float32") - } - self.attrs = {'dim': (0, 3), 'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class Test1DReduce(OpTest): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random(120).astype("float32")} - self.attrs = {'use_xpu': True} - self.outputs = {'Out': self.inputs['X'].mean(axis=0)} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - -class Test2DReduce0(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [0], 'use_xpu': True} - 
self.inputs = {'X': np.random.random((20, 10)).astype("float32")} - self.outputs = {'Out': self.inputs['X'].mean(axis=0)} - - -class Test2DReduce1(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [1], 'use_xpu': True} - self.inputs = {'X': np.random.random((20, 10)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class Test3DReduce0(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [1], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class Test3DReduce1(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [2], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class Test3DReduce2(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [-2], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class Test3DReduce3(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.attrs = {'dim': [1, 2], 'use_xpu': True} - self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")} - self.outputs = { - 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) - } - - -class TestKeepDimReduce(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} - self.attrs = {'dim': [1], 'keep_dim': True, 'use_xpu': True} - self.outputs = { - 'Out': - self.inputs['X'].mean(axis=tuple(self.attrs['dim']), - keepdims=self.attrs['keep_dim']) - } - - -class TestKeepDim8DReduce(Test1DReduce): - - def setUp(self): - self.op_type = "reduce_mean" - self.inputs = { - 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float32") - } - self.attrs = {'dim': (3, 4, 5), 'keep_dim': True, 'use_xpu': True} - self.outputs = { - 'Out': - self.inputs['X'].mean(axis=tuple(self.attrs['dim']), - keepdims=self.attrs['keep_dim']) - } + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + class TestMeanOp5D(TestMeanOp): + + def setUp(self): + super().setUp() + self.inputs = { + 'X': np.random.random((1, 2, 5, 6, 10)).astype(self.dtype) + } + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + class TestMeanOp6D(TestMeanOp): + + def setUp(self): + super().setUp() + self.inputs = { + 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype(self.dtype) + } + self.attrs = {'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + class TestMeanOp8D(TestMeanOp): + + def setUp(self): + super().setUp() + self.inputs = { + 'X': np.random.random( + (1, 3, 1, 2, 1, 4, 3, 10)).astype(self.dtype) + } + self.attrs = {'dim': (0, 3), 'use_xpu': True} + self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))} + + +class XPUTestReduce(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'reduce_mean' + self.use_dynamic_create_class = False + + class Test1DReduce(XPUOpTest): + + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.op_type = "reduce_mean" + self.inputs = {'X': np.random.random(120).astype(self.dtype)} + self.attrs = {'use_xpu': True} + 
self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + def test_check_output(self): + self.check_output_with_place(self.place) + + # There is a api bug in checking grad when dim[0] > 0 + # def test_check_grad(self): + # self.check_output_with_place(self.place, ['X'], 'Out') + + class Test2DReduce0(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [0], 'use_xpu': True} + self.inputs = {'X': np.random.random((20, 10)).astype(self.dtype)} + self.outputs = {'Out': self.inputs['X'].mean(axis=0)} + + class Test2DReduce1(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [1], 'use_xpu': True} + self.inputs = {'X': np.random.random((20, 10)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class Test3DReduce0(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [1], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class Test3DReduce1(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class Test3DReduce2(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [-2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class Test3DReduce3(Test1DReduce): + + def setUp(self): + super().setUp() + self.attrs = {'dim': [1, 2], 'use_xpu': True} + self.inputs = {'X': np.random.random((5, 6, 7)).astype(self.dtype)} + self.outputs = { + 'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim'])) + } + + class TestKeepDimReduce(Test1DReduce): + + def setUp(self): + super().setUp() + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [1], 'keep_dim': True, 'use_xpu': True} + self.outputs = { + 'Out': + self.inputs['X'].mean(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + class TestKeepDim8DReduce(Test1DReduce): + + def setUp(self): + super().setUp() + self.inputs = { + 'X': np.random.random( + (2, 5, 3, 2, 2, 3, 4, 2)).astype(self.dtype) + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True, 'use_xpu': True} + self.outputs = { + 'Out': + self.inputs['X'].mean(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + +support_types = get_xpu_op_support_types('reduce_mean') +for stype in support_types: + create_test_class(globals(), XPUTestMeanOp, stype) + create_test_class(globals(), XPUTestReduce, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py index 4c830b1e8729a..deebd2e02ff8a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py @@ -20,208 +20,220 @@ import math import numpy as np import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci from op_test_xpu import XPUOpTest import paddle -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard - - -class TestROIAlignOp(XPUOpTest): - - def set_data(self): - self.init_test_case() - 
self.make_rois() - self.calc_roi_align() - - self.inputs = { - 'X': self.x, - 'ROIs': (self.rois[:, 1:5], self.rois_lod), - } - self.attrs = { - 'spatial_scale': self.spatial_scale, - 'pooled_height': self.pooled_height, - 'pooled_width': self.pooled_width, - 'sampling_ratio': self.sampling_ratio, - 'aligned': self.continuous_coordinate - } - - self.outputs = {'Out': self.out_data} - - def init_test_case(self): - self.batch_size = 3 - self.channels = 3 - self.height = 8 - self.width = 6 - - self.xpu_version = core.get_xpu_device_version(0) - - # n, c, h, w - self.x_dim = (self.batch_size, self.channels, self.height, self.width) - - self.spatial_scale = 1.0 / 2.0 - self.pooled_height = 2 - self.pooled_width = 2 - self.sampling_ratio = -1 - if self.xpu_version == core.XPUVersion.XPU1: - self.continuous_coordinate = False - else: - self.continuous_coordinate = bool(np.random.randint(2)) - self.x = np.random.random(self.x_dim).astype('float32') - - def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w, - bin_size_h, bin_size_w): - count = roi_bin_grid_h * roi_bin_grid_w - bilinear_pos = np.zeros( - [self.channels, self.pooled_height, self.pooled_width, count, 4], - np.float32) - bilinear_w = np.zeros([self.pooled_height, self.pooled_width, count, 4], - np.float32) - for ph in range(self.pooled_width): - for pw in range(self.pooled_height): - c = 0 - for iy in range(roi_bin_grid_h): - y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \ - bin_size_h / roi_bin_grid_h - for ix in range(roi_bin_grid_w): - x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \ - bin_size_w / roi_bin_grid_w - if y < -1.0 or y > self.height or \ - x < -1.0 or x > self.width: - continue - if y <= 0: - y = 0 - if x <= 0: - x = 0 - y_low = int(y) - x_low = int(x) - if y_low >= self.height - 1: - y = y_high = y_low = self.height - 1 - else: - y_high = y_low + 1 - if x_low >= self.width - 1: - x = x_high = x_low = self.width - 1 - else: - x_high = x_low + 1 - ly = y - y_low - lx = x - x_low - hy = 1 - ly - hx = 1 - lx - for ch in range(self.channels): - bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low, - x_low] - bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low, - x_high] - bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high, - x_low] - bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high, - x_high] - bilinear_w[ph, pw, c, 0] = hy * hx - bilinear_w[ph, pw, c, 1] = hy * lx - bilinear_w[ph, pw, c, 2] = ly * hx - bilinear_w[ph, pw, c, 3] = ly * lx - c = c + 1 - return bilinear_pos, bilinear_w - - def calc_roi_align(self): - self.out_data = np.zeros( - (self.rois_num, self.channels, self.pooled_height, - self.pooled_width)).astype('float32') - - for i in range(self.rois_num): - roi = self.rois[i] - roi_batch_id = int(roi[0]) - x_i = self.x[roi_batch_id] - roi_offset = 0.5 if self.continuous_coordinate else 0 - roi_xmin = roi[1] * self.spatial_scale - roi_offset - roi_ymin = roi[2] * self.spatial_scale - roi_offset - roi_xmax = roi[3] * self.spatial_scale - roi_offset - roi_ymax = roi[4] * self.spatial_scale - roi_offset - roi_width = roi_xmax - roi_xmin - roi_height = roi_ymax - roi_ymin - if not self.continuous_coordinate: - roi_width = max(roi_width, 1) - roi_height = max(roi_height, 1) - bin_size_h = float(roi_height) / float(self.pooled_height) - bin_size_w = float(roi_width) / float(self.pooled_width) - roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \ - math.ceil(roi_height / self.pooled_height) - roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \ - math.ceil(roi_width / 
self.pooled_width) - count = int(roi_bin_grid_h * roi_bin_grid_w) - pre_size = count * self.pooled_width * self.pooled_height - bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin, - int(roi_bin_grid_h), - int(roi_bin_grid_w), - bin_size_h, bin_size_w) - for ch in range(self.channels): - align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1) - output_val = align_per_bin.mean(axis=-1) - self.out_data[i, ch, :, :] = output_val - - def make_rois(self): - rois = [] - self.rois_lod = [[]] - for bno in range(self.batch_size): - self.rois_lod[0].append(bno + 1) - for i in range(bno + 1): - x1 = np.random.random_integers( - 0, self.width // self.spatial_scale - self.pooled_width) - y1 = np.random.random_integers( - 0, self.height // self.spatial_scale - self.pooled_height) - - x2 = np.random.random_integers(x1 + self.pooled_width, - self.width // self.spatial_scale) - y2 = np.random.random_integers( - y1 + self.pooled_height, self.height // self.spatial_scale) - - roi = [bno, x1, y1, x2, y2] - rois.append(roi) - self.rois_num = len(rois) - self.rois = np.array(rois).astype("float32") - - def setUp(self): - self.op_type = "roi_align" - self.set_data() - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if core.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, {'X'}, 'Out') - - -class TestROIAlignInLodOp(TestROIAlignOp): - - def set_data(self): - self.init_test_case() - self.make_rois() - self.calc_roi_align() - - seq_len = self.rois_lod[0] - - self.inputs = { - 'X': self.x, - 'ROIs': (self.rois[:, 1:5], self.rois_lod), - 'RoisNum': np.asarray(seq_len).astype('int32') - } - - self.attrs = { - 'spatial_scale': self.spatial_scale, - 'pooled_height': self.pooled_height, - 'pooled_width': self.pooled_width, - 'sampling_ratio': self.sampling_ratio, - 'aligned': self.continuous_coordinate - } - - self.outputs = {'Out': self.out_data} - +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestROIAlignOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'roi_align' + self.use_dynamic_create_class = False + + class TestROIAlignOp(XPUOpTest): + + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_roi_align() + + self.inputs = { + 'X': self.x, + 'ROIs': (self.rois[:, 1:5], self.rois_lod), + } + self.attrs = { + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width, + 'sampling_ratio': self.sampling_ratio, + 'aligned': self.continuous_coordinate + } + + self.outputs = {'Out': self.out_data} + + def init_test_case(self): + self.batch_size = 3 + self.channels = 3 + self.height = 8 + self.width = 6 + + self.xpu_version = core.get_xpu_device_version(0) + + # n, c, h, w + self.x_dim = (self.batch_size, self.channels, self.height, + self.width) + + self.spatial_scale = 1.0 / 2.0 + self.pooled_height = 2 + self.pooled_width = 2 + self.sampling_ratio = -1 + if self.xpu_version == core.XPUVersion.XPU1: + self.continuous_coordinate = False + else: + self.continuous_coordinate = bool(np.random.randint(2)) + self.x = np.random.random(self.x_dim).astype(self.dtype) + + def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, + roi_bin_grid_w, bin_size_h, bin_size_w): + count = roi_bin_grid_h * roi_bin_grid_w + bilinear_pos = 
np.zeros([ + self.channels, self.pooled_height, self.pooled_width, count, 4 + ], np.float32) + bilinear_w = np.zeros( + [self.pooled_height, self.pooled_width, count, 4], np.float32) + for ph in range(self.pooled_width): + for pw in range(self.pooled_height): + c = 0 + for iy in range(roi_bin_grid_h): + y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \ + bin_size_h / roi_bin_grid_h + for ix in range(roi_bin_grid_w): + x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \ + bin_size_w / roi_bin_grid_w + if y < -1.0 or y > self.height or \ + x < -1.0 or x > self.width: + continue + if y <= 0: + y = 0 + if x <= 0: + x = 0 + y_low = int(y) + x_low = int(x) + if y_low >= self.height - 1: + y = y_high = y_low = self.height - 1 + else: + y_high = y_low + 1 + if x_low >= self.width - 1: + x = x_high = x_low = self.width - 1 + else: + x_high = x_low + 1 + ly = y - y_low + lx = x - x_low + hy = 1 - ly + hx = 1 - lx + for ch in range(self.channels): + bilinear_pos[ch, ph, pw, c, 0] = x_i[ch, y_low, + x_low] + bilinear_pos[ch, ph, pw, c, 1] = x_i[ch, y_low, + x_high] + bilinear_pos[ch, ph, pw, c, 2] = x_i[ch, y_high, + x_low] + bilinear_pos[ch, ph, pw, c, 3] = x_i[ch, y_high, + x_high] + bilinear_w[ph, pw, c, 0] = hy * hx + bilinear_w[ph, pw, c, 1] = hy * lx + bilinear_w[ph, pw, c, 2] = ly * hx + bilinear_w[ph, pw, c, 3] = ly * lx + c = c + 1 + return bilinear_pos, bilinear_w + + def calc_roi_align(self): + self.out_data = np.zeros( + (self.rois_num, self.channels, self.pooled_height, + self.pooled_width)).astype(self.dtype) + + for i in range(self.rois_num): + roi = self.rois[i] + roi_batch_id = int(roi[0]) + x_i = self.x[roi_batch_id] + roi_offset = 0.5 if self.continuous_coordinate else 0 + roi_xmin = roi[1] * self.spatial_scale - roi_offset + roi_ymin = roi[2] * self.spatial_scale - roi_offset + roi_xmax = roi[3] * self.spatial_scale - roi_offset + roi_ymax = roi[4] * self.spatial_scale - roi_offset + roi_width = roi_xmax - roi_xmin + roi_height = roi_ymax - roi_ymin + if not self.continuous_coordinate: + roi_width = max(roi_width, 1) + roi_height = max(roi_height, 1) + bin_size_h = float(roi_height) / float(self.pooled_height) + bin_size_w = float(roi_width) / float(self.pooled_width) + roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \ + math.ceil(roi_height / self.pooled_height) + roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \ + math.ceil(roi_width / self.pooled_width) + count = int(roi_bin_grid_h * roi_bin_grid_w) + pre_size = count * self.pooled_width * self.pooled_height + bilinear_pos, bilinear_w = self.pre_calc( + x_i, roi_xmin, roi_ymin, int(roi_bin_grid_h), + int(roi_bin_grid_w), bin_size_h, bin_size_w) + for ch in range(self.channels): + align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1) + output_val = align_per_bin.mean(axis=-1) + self.out_data[i, ch, :, :] = output_val + + def make_rois(self): + rois = [] + self.rois_lod = [[]] + for bno in range(self.batch_size): + self.rois_lod[0].append(bno + 1) + for i in range(bno + 1): + x1 = np.random.random_integers( + 0, self.width // self.spatial_scale - self.pooled_width) + y1 = np.random.random_integers( + 0, + self.height // self.spatial_scale - self.pooled_height) + + x2 = np.random.random_integers( + x1 + self.pooled_width, + self.width // self.spatial_scale) + y2 = np.random.random_integers( + y1 + self.pooled_height, + self.height // self.spatial_scale) + + roi = [bno, x1, y1, x2, y2] + rois.append(roi) + self.rois_num = len(rois) + self.rois = np.array(rois).astype(self.dtype) + + def 
setUp(self): + self.set_xpu() + self.op_type = "roi_align" + self.place = paddle.XPUPlace(0) + self.dtype = self.in_type + self.set_data() + + def set_xpu(self): + self.__class__.use_xpu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, {'X'}, 'Out') + + class TestROIAlignInLodOp(TestROIAlignOp): + + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_roi_align() + + seq_len = self.rois_lod[0] + + self.inputs = { + 'X': self.x, + 'ROIs': (self.rois[:, 1:5], self.rois_lod), + 'RoisNum': np.asarray(seq_len).astype('int32') + } + + self.attrs = { + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width, + 'sampling_ratio': self.sampling_ratio, + 'aligned': self.continuous_coordinate + } + + self.outputs = {'Out': self.out_data} + + +support_types = get_xpu_op_support_types('roi_align') +for stype in support_types: + create_test_class(globals(), XPUTestROIAlignOp, stype) if __name__ == '__main__': unittest.main() From 7d3b08d965e64d9d8a44a4c31e88a454192005b5 Mon Sep 17 00:00:00 2001 From: helen88 Date: Tue, 5 Jul 2022 17:49:41 +0800 Subject: [PATCH 061/250] refactor mean op, *test=kunlun (#44000) * refactor mean op, *test=kunlun * refactor mean op, *test=kunlun * refactor mean op,*test=kunlun * refactor mean op,*test=kunlun --- .../fluid/tests/unittests/xpu/CMakeLists.txt | 3 - .../tests/unittests/xpu/test_mean_op_xpu.py | 113 +++++++++--------- 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt index 6267526f33c12..c6aaf363138d4 100644 --- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt @@ -16,9 +16,6 @@ if(WITH_XPU_BKCL) list(APPEND DIST_TEST_OPS test_gen_bkcl_id_op) endif() -list(REMOVE_ITEM TEST_OPS test_concat_op_xpu) -list(REMOVE_ITEM TEST_OPS test_mean_op_xpu) - foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py index 892a5b6840ab9..cd21dcca4c0ab 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py @@ -28,29 +28,66 @@ np.random.seed(10) +import op_test +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestMeanOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'mean' + self.use_dynamic_create_class = False + + class TestMeanOp(XPUOpTest): + + def setUp(self): + self.init_dtype() + self.set_xpu() + self.op_type = "mean" + self.place = paddle.XPUPlace(0) + self.set_shape() + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.outputs = {'Out': np.mean(self.inputs["X"]).astype(np.float16)} + + def init_dtype(self): + self.dtype = self.in_type + + def set_shape(self): + self.shape = (10, 10) + + def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = True + self.__class__.op_type = self.dtype + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_checkout_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') -class 
TestMeanOp(XPUOpTest): + class TestMeanOp1(TestMeanOp): - def setUp(self): - self.op_type = "mean" - self.init_dtype_type() - self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)} - self.outputs = {'Out': np.mean(self.inputs["X"]).astype(np.float16)} + def set_shape(self): + self.shape = (5) - def init_dtype_type(self): - self.dtype = np.float32 + class TestMeanOp2(TestMeanOp): - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=2e-3) + def set_shape(self): + self.shape = (5, 7, 8) - def test_checkout_grad(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') + class TestMeanOp3(TestMeanOp): + + def set_shape(self): + self.shape = (10, 5, 7, 8) + + class TestMeanOp4(TestMeanOp): + + def set_shape(self): + self.shape = (2, 2, 3, 3, 3) class TestMeanOpError(unittest.TestCase): @@ -71,43 +108,9 @@ def test_errors(self): fluid.layers.softmax(input3) -class TestXPUMeanOp(TestMeanOp): - - def init_dtype_type(self): - self.dtype = np.float32 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_checkout_grad(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out') - - -class TestXPUMeanOpFp16(TestMeanOp): - - def init_dtype_type(self): - self.dtype = np.float16 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_checkout_grad(self): - if paddle.is_compiled_with_xpu(): - paddle.enable_static() - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], - 'Out', - max_relative_error=1.e1) - +support_types = get_xpu_op_support_types('mean') +for stype in support_types: + create_test_class(globals(), XPUTestMeanOp, stype) if __name__ == "__main__": unittest.main() From 59813de90d57198f78a0e3e29519f1a557490c36 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 5 Jul 2022 18:25:28 +0800 Subject: [PATCH 062/250] [Sparse] add SparseCsrTensor fused_attention kernel and API (#43966) * [Sparse] add SparseCsrTensor fused_attention kernel and API * fix comment --- paddle/phi/api/yaml/sparse_api.yaml | 9 + paddle/phi/api/yaml/sparse_bw_api.yaml | 7 + .../sparse/cpu/fused_attention_grad_kernel.cc | 38 +++ .../sparse/cpu/fused_attention_kernel.cc | 38 +++ .../sparse/fused_attention_grad_kernel.h | 35 +++ .../kernels/sparse/fused_attention_kernel.h | 35 +++ .../sparse/gpu/fused_attention_grad_kernel.cu | 153 ++++++++++ .../sparse/gpu/fused_attention_kernel.cu | 278 ++++++++++++++++++ .../phi/kernels/sparse/gpu/matmul_kernel.cu | 2 +- .../phi/kernels/sparse/gpu/softmax_kernel.cu | 5 +- .../test_sparse_fused_attention_op.py | 146 +++++++++ .../incubate/sparse/nn/functional/__init__.py | 2 + .../sparse/nn/functional/activation.py | 2 +- .../sparse/nn/functional/transformer.py | 94 ++++++ 14 files changed, 840 insertions(+), 4 deletions(-) create mode 100644 paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc create mode 100644 paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc create mode 100644 paddle/phi/kernels/sparse/fused_attention_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/fused_attention_kernel.h create mode 
100644 paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu create mode 100644 paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py create mode 100644 python/paddle/incubate/sparse/nn/functional/transformer.py diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index e99009a70fc3b..a6520a0d48472 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -141,6 +141,15 @@ layout : x data_type : dtype +- api: fused_attention + args : (Tensor query, Tensor key, Tensor value, Tensor sparse_mask, Tensor key_padding_mask, Tensor attn_mask) + output : Tensor(out), Tensor(softmax) + kernel : + func : fused_attention_csr{dense, dense, dense, sparse_csr, dense, dense -> dense, sparse_csr} + layout : sparse_mask + intermediate : softmax + backward: fused_attention_grad + - api: masked_matmul args : (Tensor x, Tensor y, Tensor mask) output : Tensor(out) diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 6ceedb0978121..5296d1b870bee 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -127,3 +127,10 @@ output : Tensor(x_grad) kernel : func : coo_values_grad{sparse_coo, dense-> sparse_coo} + +- backward_api: fused_attention_grad + forward : fused_attention_csr(Tensor query, Tensor key, Tensor value, Tensor sparse_mask, Tensor key_padding_mask, Tensor attn_mask) -> Tensor(out), Tensor(softmax) + args: (Tensor query, Tensor key, Tensor value, Tensor softmax, Tensor out_grad) + output : Tensor(query_grad), Tensor(key_grad), Tensor(value_grad) + kernel : + func : fused_attention_csr_grad{dense, dense, dense, sparse_csr, dense -> dense, dense, dense} diff --git a/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc new file mode 100644 index 0000000000000..416b715a9a6a2 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/fused_attention_grad_kernel.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/fused_attention_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace sparse { + +template +void FusedAttentionCsrGradKernel(const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& softmax, + const DenseTensor& dout, + DenseTensor* dquery, + DenseTensor* dkey, + DenseTensor* dvalue) { + PD_THROW( + "Not support CPU kernel of 'sparse.nn.functional.fused_attention' now"); +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc b/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc new file mode 100644 index 0000000000000..6c652c6a8c4d6 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc @@ -0,0 +1,38 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/fused_attention_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace sparse { + +template +void FusedAttentionCsrKernel(const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& sparse_mask, + const DenseTensor& key_padding_mask, + const DenseTensor& attn_mask, + DenseTensor* out, + SparseCsrTensor* softmax) { + PD_THROW( + "Not support CPU kernel of 'sparse.nn.functional.fused_attention' now"); +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/fused_attention_grad_kernel.h b/paddle/phi/kernels/sparse/fused_attention_grad_kernel.h new file mode 100644 index 0000000000000..0a025d21f94f3 --- /dev/null +++ b/paddle/phi/kernels/sparse/fused_attention_grad_kernel.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace phi { +namespace sparse { + +template +void FusedAttentionCsrGradKernel(const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& softmax, + const DenseTensor& dout, + DenseTensor* dquery, + DenseTensor* dkey, + DenseTensor* dvalue); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/fused_attention_kernel.h b/paddle/phi/kernels/sparse/fused_attention_kernel.h new file mode 100644 index 0000000000000..feff9d72e644c --- /dev/null +++ b/paddle/phi/kernels/sparse/fused_attention_kernel.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +namespace phi { +namespace sparse { + +template +void FusedAttentionCsrKernel(const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& sparse_mask, + const DenseTensor& key_padding_mask, + const DenseTensor& attn_mask, + DenseTensor* out, + SparseCsrTensor* softmax); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu new file mode 100644 index 0000000000000..4d31ad96cdd3b --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu @@ -0,0 +1,153 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sparse/fused_attention_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" +#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/matmul_grad_kernel.h" + +namespace phi { +namespace sparse { + +template +__global__ void AttnSoftmaxGpuGradKernel(const int64_t* out_crows, + const T* out_values, + const T* dout_values, + T* dx_values, + int M, + int total_row_num, + float scale, + int batch_nnz) { + // dx = (dout - sum(dout * out)) * out + int row = blockIdx.x * blockDim.y + threadIdx.y; + if (row >= total_row_num) return; + + int cur_batch = row / M; + int crow_idx = cur_batch * (M + 1) + (row % M); + int row_first = cur_batch * batch_nnz + static_cast(out_crows[crow_idx]); + int row_nnz = static_cast(out_crows[crow_idx + 1] - out_crows[crow_idx]); + if (row_nnz == 0) return; + + int kIteration = (row_nnz + WARP_SIZE - 1) / WARP_SIZE; + T mul_result = 0; + for (int i = 0; i < kIteration; ++i) { + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + mul_result += out_values[row_first + idx] * dout_values[row_first + idx]; + } + T sum = phi::funcs::warpReduceSum(mul_result, 0xFFFFFFFF); + + for (int i = 0; i < kIteration; ++i) { + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + dx_values[row_first + idx] = (dout_values[row_first + idx] - sum) * + out_values[row_first + idx] / scale; + } +} + +template +void FusedAttentionCsrGradKernel(const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& softmax, + const DenseTensor& dout, + DenseTensor* dquery, + DenseTensor* dkey, + DenseTensor* dvalue) { +#if CUDA_VERSION >= 11070 + /* Step1: Forward: softmax{CSR} * value{Dense} -> out{Dense}, reuse */ + SparseCsrTensor dsoftmax; + CsrDenseMatmulGradKernel( + dev_ctx, softmax, value, dout, &dsoftmax, dvalue); + + /* Step2: Calculate grad of sdd_result, manualy not reuse */ + SparseCsrTensor d_sdd_result; + EmptyLikeCsrKernel(dev_ctx, dsoftmax, &d_sdd_result); + auto q_dim = query.dims(); + auto q_rank = q_dim.size(); + + int total_row_num = 1; + int batch_num = 1; + for (int i = 0; i < q_rank - 1; ++i) { + total_row_num *= q_dim[i]; + if (i < q_rank - 2) { + batch_num *= q_dim[i]; + } + } + int M = q_dim[q_rank - 2]; + int N = q_dim[q_rank - 1]; + int batch_nnz = softmax.nnz() / batch_num; + + dim3 grid((total_row_num + 3) / 4); + dim3 block(WARP_SIZE, 4); + + AttnSoftmaxGpuGradKernel<<>>( + softmax.non_zero_crows().data(), + softmax.non_zero_elements().data(), + dsoftmax.mutable_non_zero_elements()->data(), + d_sdd_result.mutable_non_zero_elements()->data(), + M, + total_row_num, + std::sqrt(N), + batch_nnz); + + /* Step3: Forward: query{Dense} * key'{Dense} -> sdd_result{SparseCsr} */ + auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); + // dquery{Dense} = d_sdd_result{SparseCsr} * key{Dense} // + dquery->Resize(query.dims()); + dev_ctx.template Alloc(dquery); + sparse_blas.SPMM(false, + false, + static_cast(1.f), + d_sdd_result, + key, + static_cast(0.f), + dquery); + + // dkey{Dense} = d_sdd_result'{SparseCsr} * query{Dense} // + dkey->Resize(key.dims()); + dev_ctx.template Alloc(dkey); + sparse_blas.SPMM(true, + false, + static_cast(1.f), + d_sdd_result, + query, + static_cast(0.f), + dkey); +#else + PADDLE_THROW( + 
phi::errors::Unimplemented("backward of 'sparse.nn.functional.attention' " + "use 'cusparseCsrSetStridedBatch', which is " + "completed supported from CUDA 11.7")); +#endif +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(fused_attention_csr_grad, + GPU, + ALL_LAYOUT, + phi::sparse::FusedAttentionCsrGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} diff --git a/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu new file mode 100644 index 0000000000000..9a7e55d2d6210 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu @@ -0,0 +1,278 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/fused_attention_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" +#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/matmul_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" + +namespace phi { +namespace sparse { + +#define PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, size, HINT, ...) \ + case size: { \ + constexpr int HINT = size; \ + __VA_ARGS__(); \ + break; \ + } + +#define VISIT_ATTN_SFOTMAX(SIZE, NAME, ...) 
\ + [&] { \ + const auto& __size__ = SIZE; \ + switch (__size__) { \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 1, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 2, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 3, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 4, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 8, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 12, KBufferSize, __VA_ARGS__) \ + PRIVATE_CASE_VISIT_ATTN_SOFTMAX(NAME, 16, KBufferSize, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for columns>512 "); \ + } \ + }() + +template +__global__ void AttnSoftmaxGpuKernel(const int64_t* x_crows, + const int64_t* x_cols, + const T* x_values, + const T* kp_mask, + const T* attn_mask, + T* out_values, + int M, + int total_row_num, + float scale, + int num_heads, + int batch_nnz) { + // out = exp(x-x_max) / sum(exp(x-x_max)) + int row = blockIdx.x * blockDim.y + threadIdx.y; + if (row >= total_row_num) return; + + int cur_batch = row / M; + int cur_row = row % M; + int crow_idx = cur_batch * (M + 1) + cur_row; + int row_first = cur_batch * batch_nnz + static_cast(x_crows[crow_idx]); + int row_nnz = static_cast(x_crows[crow_idx + 1] - x_crows[crow_idx]); + if (row_nnz == 0) return; + + T buffer[BufferSize] = {0}; + int kIteration = (row_nnz + WARP_SIZE - 1) / WARP_SIZE; + + T max_val = -std::numeric_limits::infinity(); + for (int i = 0; i < kIteration; ++i) { + bool mask = false; + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + int col_idx = static_cast(x_cols[row_first + idx]); + + if (kp_mask != nullptr && + kp_mask[(cur_batch / num_heads) * M + col_idx] == 0) { + mask = true; + } + if (attn_mask != nullptr && attn_mask[cur_row * M + col_idx] == 0) { + mask = true; + } + + if (!mask) { + buffer[i] = x_values[row_first + idx] / scale; + if (buffer[i] > max_val) { + max_val = buffer[i]; + } + } + } + T row_max_val = phi::funcs::warpReduceMax(max_val, 0xFFFFFFFF); + + auto functor = phi::funcs::CudaExpFunctor(); + T exp_sum = 0; + for (int i = 0; i < kIteration; ++i) { + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + if (buffer[i]) { + T exp = functor(buffer[i] - row_max_val); + exp_sum += exp; + buffer[i] = exp; + } + } + T row_exp_sum = phi::funcs::warpReduceSum(exp_sum, 0xFFFFFFFF); + + for (int i = 0; i < kIteration; ++i) { + int idx = threadIdx.x + i * WARP_SIZE; + if (idx >= row_nnz) break; + + if (buffer[i]) { + out_values[row_first + idx] = buffer[i] / row_exp_sum; + } else { + out_values[row_first + idx] = static_cast(0); + } + } +} + +template +void FusedAttentionCsrKernel(const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& sparse_mask, + const DenseTensor& key_padding_mask, + const DenseTensor& attn_mask, + DenseTensor* out, + SparseCsrTensor* softmax) { +#if CUDA_VERSION >= 11070 + /* Check Shape */ + auto q_dim = query.dims(); + auto q_rank = q_dim.size(); + + int total_row_num = 1; + int batch_num = 1; + for (int i = 0; i < q_rank - 1; ++i) { + total_row_num *= q_dim[i]; + if (i < q_rank - 2) { + batch_num *= q_dim[i]; + } + } + int M = q_dim[q_rank - 2]; + int N = q_dim[q_rank - 1]; + + PADDLE_ENFORCE_EQ(query.dims().size(), + 4, + phi::errors::InvalidArgument(" 'query' must be 4D Tensor")); + PADDLE_ENFORCE_EQ(key.dims().size(), + 4, + phi::errors::InvalidArgument(" 'key' must be 4D Tensor")); + 
PADDLE_ENFORCE_EQ(value.dims().size(), + 4, + phi::errors::InvalidArgument(" 'value' must be 4D Tensor")); + + PADDLE_ENFORCE_EQ( + sparse_mask.dims().size(), + 3, + phi::errors::InvalidArgument("dense shape of 'sparse_mask' must be " + "[batch_size*num_heads, seq_len, seq_len]")); + PADDLE_ENFORCE_EQ( + sparse_mask.dims()[0], + q_dim[0] * q_dim[1], + phi::errors::InvalidArgument("dense shape of 'sparse_mask' must be " + "[batch_size*num_heads, seq_len, seq_len]")); + PADDLE_ENFORCE_EQ( + sparse_mask.dims()[1], + M, + phi::errors::InvalidArgument("dense shape of 'sparse_mask' must be " + "[batch_size*num_heads, seq_len, seq_len]")); + PADDLE_ENFORCE_EQ( + sparse_mask.dims()[2], + M, + phi::errors::InvalidArgument("dense shape of 'sparse_mask' must be " + "[batch_size*num_heads, seq_len, seq_len]")); + + PADDLE_ENFORCE_EQ( + key_padding_mask.dims().size(), + 2, + phi::errors::InvalidArgument( + "shape of 'key_padding_mask' must be [batch_size, seq_len]")); + PADDLE_ENFORCE_EQ( + key_padding_mask.dims()[0], + q_dim[0], + phi::errors::InvalidArgument( + "shape of 'key_padding_mask' must be [batch_size, seq_len]")); + PADDLE_ENFORCE_EQ( + key_padding_mask.dims()[1], + M, + phi::errors::InvalidArgument( + "shape of 'key_padding_mask' must be [batch_size, seq_len]")); + + PADDLE_ENFORCE_EQ(attn_mask.dims().size(), + 2, + phi::errors::InvalidArgument( + "shape of 'attn_mask' must be [seq_len, seq_len]")); + PADDLE_ENFORCE_EQ(attn_mask.dims()[0], + M, + phi::errors::InvalidArgument( + "shape of 'attn_mask' must be [seq_len, seq_len]")); + PADDLE_ENFORCE_EQ(attn_mask.dims()[1], + M, + phi::errors::InvalidArgument( + "shape of 'attn_mask' must be [seq_len, seq_len]")); + + /* Step1: SDD Matmul, reuse */ + SparseCsrTensor sdd_result; + EmptyLikeCsrKernel(dev_ctx, sparse_mask, &sdd_result); + auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); + sparse_blas.SDDMM(false, + true, + static_cast(1), + query, + key, + static_cast(0), + &sdd_result); + + /* Step2: Softmax with kp_mask/attn_mask, manualy not reuse */ + EmptyLikeCsrKernel(dev_ctx, sdd_result, softmax); + + int buffer_size; + if (M < 128) { + buffer_size = (M + 32 - 1) / 32; + } else { + buffer_size = ((M + 128 - 1) / 128) * 4; + } + + dim3 grid((total_row_num + 3) / 4); + dim3 block(WARP_SIZE, 4); + + int batch_nnz = sdd_result.nnz() / batch_num; + + VISIT_ATTN_SFOTMAX(buffer_size, "AttnSoftmaxGpuKernel", [&] { + AttnSoftmaxGpuKernel<<>>( + sdd_result.non_zero_crows().data(), + sdd_result.non_zero_cols().data(), + sdd_result.non_zero_elements().data(), + key_padding_mask.data(), + attn_mask.data(), + softmax->mutable_non_zero_elements()->data(), + M, + total_row_num, + std::sqrt(N), + q_dim[1], + batch_nnz); + }); + + /* Step3: DSD Matmul, reuse */ + softmax->set_dims(phi::make_ddim({q_dim[0], q_dim[1], q_dim[2], q_dim[2]})); + CsrDenseMatmulKernel(dev_ctx, *softmax, value, out); +#else + PADDLE_THROW( + phi::errors::Unimplemented("forward of 'sparse.nn.functional.attention' " + "use 'cusparseCsrSetStridedBatch', which is " + "completed supported from CUDA 11.7")); +#endif +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(fused_attention_csr, + GPU, + ALL_LAYOUT, + phi::sparse::FusedAttentionCsrKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 9357bbd2ad083..69cd4bac0c763 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ 
b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -76,7 +76,7 @@ void CsrDenseMatmulKernel(const Context& dev_ctx, out_dim_vec[y_ndims - 1] = ydim_vec[y_ndims - 1]; MetaTensor meta_out(out); meta_out.set_dims(phi::make_ddim(out_dim_vec)); - meta_out.set_dtype(x.non_zero_elements().dtype()); + meta_out.set_dtype(y.dtype()); dev_ctx.template Alloc(out); diff --git a/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu b/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu index 9c9f5cfbca545..ee0671b333f81 100644 --- a/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/softmax_kernel.cu @@ -52,8 +52,9 @@ __global__ void SoftmaxGpuKernel(const IntT* x_crows, int idx = non_zero_idx + i * warpSize; if (idx >= row_nnz) break; - if (max_val < x_values[row_first + idx]) { - max_val = x_values[row_first + idx]; + T val = x_values[row_first + idx]; + if (val > max_val) { + max_val = val; } } T row_max_val = phi::funcs::warpReduceMax(max_val, 0xFFFFFFFF); diff --git a/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py new file mode 100644 index 0000000000000..e34f890cc53d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py @@ -0,0 +1,146 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import math +import re +import copy +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard + + +def get_cuda_version(): + result = os.popen("nvcc --version").read() + regex = r'release (\S+),' + match = re.search(regex, result) + if match: + num = str(match.group(1)) + integer, decimal = num.split('.') + return int(integer) * 1000 + int(float(decimal) * 10) + else: + return -1 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11070, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.3" +) +class TestSparseAttentionAPI1(unittest.TestCase): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 128 + self.head_dim = 16 + self.dtype = 'float64' + + def test_dygraph(self): + with _test_eager_guard(): + self.shape = [ + self.batch_size, self.num_heads, self.seq_len, self.head_dim + ] + query = paddle.rand(self.shape, self.dtype) + key = paddle.rand(self.shape, self.dtype) + value = paddle.rand(self.shape, self.dtype) + + query.stop_gradient = False + key.stop_gradient = False + value.stop_gradient = False + + mask = paddle.nn.functional.dropout(paddle.ones( + [self.seq_len, self.seq_len]), + mode='downscale_in_infer') + mask = mask.expand( + [self.batch_size, self.num_heads, self.seq_len, self.seq_len]) + sp_mask = mask.reshape([-1, self.seq_len, + self.seq_len]).to_sparse_csr() + + kp_mask = paddle.randint( + 0, 2, [self.batch_size, self.seq_len]).astype(self.dtype) + attn_mask = paddle.randint( + 0, 2, [self.seq_len, self.seq_len]).astype(self.dtype) + + sdd = paddle.matmul(query, key, False, True) / math.sqrt( + float(self.head_dim)) + sdd = sdd + ( + (mask * kp_mask.unsqueeze([1, 2]) * attn_mask) - 1.0) * 1e9 + softmax = paddle.nn.functional.softmax(sdd) + output = paddle.matmul(softmax, value) + output.backward() + + query_cp = copy.deepcopy(query) + key_cp = copy.deepcopy(key) + value_cp = copy.deepcopy(value) + + query_cp.stop_gradient = False + key_cp.stop_gradient = False + value_cp.stop_gradient = False + + output_cp = paddle.incubate.sparse.nn.functional.attention( + query_cp, key_cp, value_cp, sp_mask, kp_mask, attn_mask) + output_cp.backward() + + self.assertTrue(np.allclose(output_cp.numpy(), output.numpy())) + self.assertTrue( + np.allclose(query_cp.grad.numpy(), query.grad.numpy())) + self.assertTrue(np.allclose(key_cp.grad.numpy(), key.grad.numpy())) + self.assertTrue( + np.allclose(value_cp.grad.numpy(), value.grad.numpy())) + + +class TestSparseAttentionAPI2(TestSparseAttentionAPI1): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 128 + self.head_dim = 32 + self.dtype = 'float64' + + +class TestSparseAttentionAPI3(TestSparseAttentionAPI1): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 512 + self.head_dim = 16 + self.dtype = 'float64' + + +class TestSparseAttentionAPI4(TestSparseAttentionAPI1): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 512 + self.head_dim = 32 + self.dtype = 'float64' + + +class TestSparseAttentionAPI5(TestSparseAttentionAPI1): + + def setUp(self): + self.batch_size = 16 + self.num_heads = 16 + self.seq_len = 512 + self.head_dim = 64 + self.dtype = 'float64' + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/sparse/nn/functional/__init__.py b/python/paddle/incubate/sparse/nn/functional/__init__.py index 
af5636a622a9a..21939eeb1a4f9 100644 --- a/python/paddle/incubate/sparse/nn/functional/__init__.py +++ b/python/paddle/incubate/sparse/nn/functional/__init__.py @@ -14,6 +14,7 @@ from .conv import conv3d # noqa: F401 from .conv import subm_conv3d # noqa: F401 +from .transformer import attention # noqa: F401 from .pooling import max_pool3d # noqa: F401 from .activation import relu # noqa: F401 from .activation import softmax # noqa: F401 @@ -24,4 +25,5 @@ 'max_pool3d', 'relu', 'softmax', + 'attention', ] diff --git a/python/paddle/incubate/sparse/nn/functional/activation.py b/python/paddle/incubate/sparse/nn/functional/activation.py index 12d44063e0015..dc2969424086e 100644 --- a/python/paddle/incubate/sparse/nn/functional/activation.py +++ b/python/paddle/incubate/sparse/nn/functional/activation.py @@ -14,7 +14,7 @@ __all__ = [] -from paddle import _C_ops, in_dynamic_mode +from paddle import _C_ops from paddle.fluid.framework import dygraph_only diff --git a/python/paddle/incubate/sparse/nn/functional/transformer.py b/python/paddle/incubate/sparse/nn/functional/transformer.py new file mode 100644 index 0000000000000..3429eecccd758 --- /dev/null +++ b/python/paddle/incubate/sparse/nn/functional/transformer.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [] + +from paddle import _C_ops +from paddle.fluid.framework import dygraph_only + + +@dygraph_only +def attention(query, + key, + value, + sparse_mask, + key_padding_mask, + attn_mask, + name=None): + """ + Note: + This API is only used from ``CUDA 11.7`` . + + SparseCsrTensor is used to store the intermediate result of Attention matrix + in Transformer module, which can reduce memory usage and improve performance. + ``sparse_mask`` express the sparse layout in CSR format. + The calculation equation is: + + .. math:: + + result = softmax(\frac{ Q * K^T }{\sqrt{d}}) * V + + where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. + The shape of the three parameters are: `[batch_size, num_heads, seq_len, head_dim]`, and + ``d`` represents ``head_dim`` . + + Args: + query(DenseTensor): `query` in the Attention module. 4D Tensor with float32 or float64. + key(DenseTensor): `key` in the Attention module. 4D Tensor with float32 or float64. + value(DenseTensor): `value` in the Attention module. 4D Tensor with float32 or float64. + sparse_mask(SparseCsrTensor): The sparse layout in the Attention module. Its dense shape + is `[batch_size*num_heads, seq_len, seq_len]` . `nnz` of each batch must be the same. + dtype of `crows` and `cols` must be int64, dtype of `values` can be float32 or float64. + key_padding_mask(DenseTensor): The key padding mask tensor in the Attention module. + 2D tensor with shape: [batch_size, seq_len]. dtype can be float32 or float64. + attn_mask(DenseTensor):The attention mask tensor in the Attention module. + 2D tensor with shape: [seq_len, seq_len]. dtype can be float32 or float64. 
+ name(str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to + :ref:`api_guide_Name`. + + Returns: + 4D tensor with shape: [batch_size, num_heads, seq_len, head_dim]. dtype is same with input. + + Examples: + .. code-block:: python + + import paddle + + batch_size = 16 + num_heads = 16 + seq_len = 512 + head_dim = 32 + + query = paddle.rand([batch_size, num_heads, seq_len, head_dim]) + key = paddle.rand([batch_size, num_heads, seq_len, head_dim]) + value = paddle.rand([batch_size, num_heads, seq_len, head_dim]) + + query.stop_gradient = False + key.stop_gradient = False + value.stop_gradient = False + + mask = paddle.nn.functional.dropout(paddle.ones([seq_len, seq_len])).expand([batch_size, num_heads, seq_len, seq_len]) + sp_mask = mask.reshape([-1, seq_len, seq_len]).to_sparse_csr() + + kp_mask = paddle.randint(0, 2, [batch_size, seq_len]) + attn_mask = paddle.randint(0, 2, [seq_len, seq_len]) + + output = paddle.incubate.sparse.nn.functional.attention(query, key, value, sp_mask, kp_mask, attn_mask) + output.backward() + """ + return _C_ops.final_state_sparse_fused_attention(query, key, value, + sparse_mask, + key_padding_mask, + attn_mask) From e0d7d790112af610a03624602f3361bacde496eb Mon Sep 17 00:00:00 2001 From: Zuza Gawrysiak Date: Tue, 5 Jul 2022 13:08:38 +0200 Subject: [PATCH 063/250] Refactor quantization of immutable ops (#43973) * Refactor quantization of immutable ops * Fix code formatting * Fix formatting * Specify input names * Fix formatting * Change string to reference * Formatting --- .../framework/ir/graph_pattern_detector.cc | 85 ++----- .../framework/ir/graph_pattern_detector.h | 69 ++--- .../framework/ir/mkldnn/cpu_quantize_pass.cc | 235 +++--------------- .../framework/ir/mkldnn/cpu_quantize_pass.h | 9 +- .../ir/mkldnn/cpu_quantize_pass_tester.cc | 60 ++--- 5 files changed, 87 insertions(+), 371 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 7ad02fe5ab87f..154df498e7d13 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1802,80 +1802,23 @@ PDNode *patterns::Conv::operator()() { return output_var; } -PDNode *patterns::Transpose::operator()() { +PDNode *patterns::Immutable::operator()(const std::string &immutable_type, + const std::string &input_name) { auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); - auto transpose_op = - pattern->NewNode(transpose_op_repr())->assert_is_op("transpose2"); + auto immutable_op = + pattern->NewNode(immutable_op_repr())->assert_is_op(immutable_type); - auto transpose_in = pattern->NewNode(transpose_in_repr()) + auto immutable_in = pattern->NewNode(immutable_in_repr()) ->AsInput() - ->assert_is_op_input("transpose2"); - auto transpose_out = pattern->NewNode(transpose_out_repr()) + ->assert_is_op_input(immutable_type, input_name); + auto immutable_out = pattern->NewNode(immutable_out_repr()) ->AsOutput() - ->assert_is_op_output("transpose2", "Out"); + ->assert_is_op_output(immutable_type, "Out"); - prev_op->LinksTo({transpose_in}); - transpose_op->LinksFrom({transpose_in}).LinksTo({transpose_out}); - return transpose_out; -} - -PDNode *patterns::Reshape::operator()() { - auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); - - auto reshape_op = - pattern->NewNode(reshape_op_repr())->assert_is_op("reshape2"); - - auto reshape_in = pattern->NewNode(reshape_in_repr()) 
- ->AsInput() - ->assert_is_op_input("reshape2", "X"); - auto reshape_out = pattern->NewNode(reshape_out_repr()) - ->AsOutput() - ->assert_is_op_output("reshape2", "Out"); - - prev_op->LinksTo({reshape_in}); - reshape_op->LinksFrom({reshape_in}).LinksTo({reshape_out}); - return reshape_out; -} - -PDNode *patterns::Slice::operator()() { - auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); - - auto slice_op = pattern->NewNode(slice_op_repr())->assert_is_op("slice"); - - auto slice_in = pattern->NewNode(slice_in_repr()) - ->AsInput() - ->assert_is_op_input("slice", "Input"); - auto slice_out = pattern->NewNode(slice_out_repr()) - ->AsOutput() - ->assert_is_op_output("slice", "Out"); - - prev_op->LinksTo({slice_in}); - slice_op->LinksFrom({slice_in}).LinksTo({slice_out}); - return slice_out; -} - -PDNode *patterns::NearestInterp::operator()() { - auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); - - auto nearest_interp_op = - pattern->NewNode(nearest_interp_op_repr()) - ->assert_is_ops({"nearest_interp", "nearest_interp_v2"}); - - auto nearest_interp_in = - pattern->NewNode(nearest_interp_in_repr()) - ->AsInput() - ->assert_is_ops_input({"nearest_interp", "nearest_interp_v2"}, "X"); - auto nearest_interp_out = - pattern->NewNode(nearest_interp_out_repr()) - ->AsOutput() - ->assert_is_ops_output({"nearest_interp", "nearest_interp_v2"}, - "Out"); - - prev_op->LinksTo({nearest_interp_in}); - nearest_interp_op->LinksFrom({nearest_interp_in}) - .LinksTo({nearest_interp_out}); - return nearest_interp_out; + prev_op->LinksTo({immutable_in}); + immutable_op->LinksFrom({immutable_in}).LinksTo({immutable_out}); + return immutable_out; } PDNode *patterns::Matmul::operator()() { @@ -2118,7 +2061,7 @@ PDNode *patterns::Pool::operator()() { PDNode *patterns::Elementwise::operator()(PDNode *x_var, PDNode *y_var, - const std::string elementwise_type) { + const std::string &elementwise_type) { auto elementwise_op = pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); @@ -2135,7 +2078,7 @@ PDNode *patterns::Elementwise::operator()(PDNode *x_var, } PDNode *patterns::ElementwiseOp::operator()( - const std::string elementwise_type) { + const std::string &elementwise_type) { auto elementwise_op = pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); @@ -2151,7 +2094,7 @@ PDNode *patterns::ElementwiseOp::operator()( PDNode *patterns::ResidualElementwise::operator()( PDNode *op_var, PDNode *residual_var, - const std::string elementwise_type, + const std::string &elementwise_type, bool as_x) { auto elementwise_op = pattern->NewNode(elementwise_op_repr())->assert_is_op(elementwise_type); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 29d645f6beba0..be14ef2dbf3ea 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1087,7 +1087,7 @@ struct Elementwise : public PatternBase { PDNode* operator()(PDNode* x_var, PDNode* y_var, - const std::string elementwise_type); + const std::string& elementwise_type); PATTERN_DECL_NODE(elementwise_op); PATTERN_DECL_NODE(elementwise_x); @@ -1102,7 +1102,7 @@ struct ElementwiseOp : public PatternBase { ElementwiseOp(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "elementwise") {} - PDNode* operator()(const std::string elementwise_type); + PDNode* operator()(const std::string& elementwise_type); PATTERN_DECL_NODE(elementwise_op); 
PATTERN_DECL_NODE(elementwise_out); @@ -1118,7 +1118,7 @@ struct ResidualElementwise : public PatternBase { : PatternBase(pattern, name_scope, "residual_elementwise") {} PDNode* operator()(PDNode* op_var, PDNode* residual_var, - const std::string elementwise_type, + const std::string& elementwise_type, bool as_x); PATTERN_DECL_NODE(operator_output); @@ -1127,59 +1127,20 @@ struct ResidualElementwise : public PatternBase { PATTERN_DECL_NODE(elementwise_out); }; -// Transpose op -// Forward pass for transpose. -// transpose_out is a result of the operator. -struct Transpose : public PatternBase { - Transpose(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "transpose2") {} +// General struct for immutable ops: +// reshape, transpose, slice, nearest-interp +// Forward pass for no weights-op. +// immutable_out is a result of the operator. +struct Immutable : public PatternBase { + Immutable(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "immutable") {} - PDNode* operator()(); - PATTERN_DECL_NODE(prev_op); - PATTERN_DECL_NODE(transpose_in); - PATTERN_DECL_NODE(transpose_op); - PATTERN_DECL_NODE(transpose_out); -}; - -// Reshape op -// Forward pass for reshape. -// reshape_out is a result of the operator. -struct Reshape : public PatternBase { - Reshape(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "reshape2") {} - - PDNode* operator()(); - PATTERN_DECL_NODE(prev_op); - PATTERN_DECL_NODE(reshape_in); - PATTERN_DECL_NODE(reshape_op); - PATTERN_DECL_NODE(reshape_out); -}; -// Slice op -// Forward pass for slice. -// slice_out is a result of the operator. -struct Slice : public PatternBase { - Slice(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "slice") {} - - PDNode* operator()(); - PATTERN_DECL_NODE(prev_op); - PATTERN_DECL_NODE(slice_in); - PATTERN_DECL_NODE(slice_op); - PATTERN_DECL_NODE(slice_out); -}; - -// Nearest Interp op -// Forward pass for nearest_interp. -// nearest_interp_out is a result of the operator. 
-struct NearestInterp : public PatternBase { - NearestInterp(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "nearest_interp") {} - - PDNode* operator()(); + PDNode* operator()(const std::string& immutable_type, + const std::string& input_name); PATTERN_DECL_NODE(prev_op); - PATTERN_DECL_NODE(nearest_interp_in); - PATTERN_DECL_NODE(nearest_interp_op); - PATTERN_DECL_NODE(nearest_interp_out); + PATTERN_DECL_NODE(immutable_in); + PATTERN_DECL_NODE(immutable_op); + PATTERN_DECL_NODE(immutable_out); }; // Matmul op diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index b0d41c16f5e98..26a4478fff683 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -669,165 +669,68 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const { LogQuantizedOpsCounter("prior_box", quantize_prior_box_count); } -void CPUQuantizePass::QuantizeTranspose(Graph* graph) const { +void CPUQuantizePass::QuantizeImmutable(Graph* graph, + const std::string& immutable_type, + const std::string& input_name) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); - patterns::Transpose transpose_pattern{pattern, name_scope_}; - transpose_pattern(); + patterns::Immutable immutable_pattern{pattern, name_scope_}; + immutable_pattern(immutable_type, input_name); - int quantize_transpose_count = 0; + int quantize_immutable_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - VLOG(4) << "Quantize transpose op"; - GET_IR_NODE_FROM_SUBGRAPH(transpose_op, transpose_op, transpose_pattern); + VLOG(4) << "Quantize " + immutable_type + " op"; + GET_IR_NODE_FROM_SUBGRAPH(immutable_op, immutable_op, immutable_pattern); // skip if should not be quantized - if (!platform::HasOpINT8DataType(transpose_op->Op())) { - LogQuantizationDisabled(transpose_op); + if (!platform::HasOpINT8DataType(immutable_op->Op())) { + LogQuantizationDisabled(immutable_op); return; } - GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, transpose_pattern); - GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern); - GET_IR_NODE_FROM_SUBGRAPH(transpose_out, transpose_out, transpose_pattern); + GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, immutable_pattern); + GET_IR_NODE_FROM_SUBGRAPH(immutable_in, immutable_in, immutable_pattern); + GET_IR_NODE_FROM_SUBGRAPH(immutable_out, immutable_out, immutable_pattern); // skip if prev op and next op is not quantized - if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(transpose_out))) { - MarkAndLogCannotQuantizeOp(transpose_op, + if (!IsOpDequantized(prev_op) && !IsOpQuantized(immutable_out)) { + MarkAndLogCannotQuantizeOp(immutable_op, "No other quantizable operators nearby"); return; } - if (!AreScalesPresentForNodes({transpose_in, transpose_out})) { - MarkAndLogCannotQuantizeOp(transpose_op, + if (!AreScalesPresentForNodes({immutable_out})) { + MarkAndLogCannotQuantizeOp(immutable_op, "No scale available for the operator"); return; } bool is_input_unsigned{false}; - auto input_scale = GetScaleValueForNode(transpose_in, &is_input_unsigned); - QuantizeInput( - g, transpose_op, transpose_in, "X", input_scale, is_input_unsigned); + auto input_scale = GetScaleValueForNode(immutable_out, &is_input_unsigned); + + QuantizeInput(g, + immutable_op, + immutable_in, + input_name, + input_scale, + is_input_unsigned); bool is_output_unsigned{false}; auto output_scale = - 
GetScaleValueForNode(transpose_out, &is_output_unsigned); + GetScaleValueForNode(immutable_out, &is_output_unsigned); DequantizeOutput(g, - transpose_op, - transpose_out, + immutable_op, + immutable_out, "Out", output_scale, is_output_unsigned); - ++quantize_transpose_count; + ++quantize_immutable_count; }; gpd(graph, handler); - AddStatis(quantize_transpose_count); - LogQuantizedOpsCounter("transpose2", quantize_transpose_count); -} - -void CPUQuantizePass::QuantizeReshape(Graph* graph) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); - patterns::Reshape reshape_pattern{pattern, name_scope_}; - reshape_pattern(); - - int quantize_reshape_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "Quantize reshape op"; - GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, reshape_pattern); - - // skip if should not be quantized - if (!platform::HasOpINT8DataType(reshape_op->Op())) { - LogQuantizationDisabled(reshape_op); - return; - } - GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, reshape_pattern); - GET_IR_NODE_FROM_SUBGRAPH(reshape_in, reshape_in, reshape_pattern); - GET_IR_NODE_FROM_SUBGRAPH(reshape_out, reshape_out, reshape_pattern); - - // skip if prev op is not quantized - if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(reshape_out))) { - MarkAndLogCannotQuantizeOp(reshape_op, - "No other quantizable operators nearby"); - return; - } - - if (!AreScalesPresentForNodes({reshape_in, reshape_out})) { - MarkAndLogCannotQuantizeOp(reshape_op, - "No scale available for the operator"); - return; - } - - bool is_input_unsigned{false}; - auto input_scale = GetScaleValueForNode(reshape_in, &is_input_unsigned); - QuantizeInput( - g, reshape_op, reshape_in, "X", input_scale, is_input_unsigned); - - bool is_output_unsigned{false}; - auto output_scale = GetScaleValueForNode(reshape_out, &is_output_unsigned); - DequantizeOutput( - g, reshape_op, reshape_out, "Out", output_scale, is_output_unsigned); - - ++quantize_reshape_count; - }; - - gpd(graph, handler); - AddStatis(quantize_reshape_count); - LogQuantizedOpsCounter("reshape2", quantize_reshape_count); -} - -void CPUQuantizePass::QuantizeSlice(Graph* graph) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); - patterns::Slice slice_pattern{pattern, name_scope_}; - slice_pattern(); - - int quantize_slice_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "Quantize slice op"; - GET_IR_NODE_FROM_SUBGRAPH(slice_op, slice_op, slice_pattern); - - // skip if should not be quantized - if (!platform::HasOpINT8DataType(slice_op->Op())) { - LogQuantizationDisabled(slice_op); - return; - } - GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, slice_pattern); - GET_IR_NODE_FROM_SUBGRAPH(slice_in, slice_in, slice_pattern); - GET_IR_NODE_FROM_SUBGRAPH(slice_out, slice_out, slice_pattern); - - // skip if prev op and next op is not quantized - if (!IsOpDequantized(prev_op) && !IsOpQuantized(slice_out)) { - MarkAndLogCannotQuantizeOp(slice_op, - "No other quantizable operators nearby"); - return; - } - - if (!AreScalesPresentForNodes({slice_out})) { - MarkAndLogCannotQuantizeOp(slice_op, - "No scale available for the operator"); - return; - } - - bool is_input_unsigned{false}; - auto input_scale = GetScaleValueForNode(slice_out, &is_input_unsigned); - QuantizeInput( - g, slice_op, slice_in, "Input", input_scale, is_input_unsigned); - - bool is_output_unsigned{false}; - auto output_scale = 
GetScaleValueForNode(slice_out, &is_output_unsigned); - DequantizeOutput( - g, slice_op, slice_out, "Out", output_scale, is_output_unsigned); - - ++quantize_slice_count; - }; - - gpd(graph, handler); - AddStatis(quantize_slice_count); - LogQuantizedOpsCounter("slice", quantize_slice_count); + AddStatis(quantize_immutable_count); + LogQuantizedOpsCounter(immutable_type, quantize_immutable_count); } void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { @@ -915,7 +818,7 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const { } void CPUQuantizePass::QuantizeElementwise( - Graph* graph, const std::string elementwise_type) const { + Graph* graph, const std::string& elementwise_type) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::ElementwiseOp elementwise_pattern{pattern, name_scope_}; @@ -1212,71 +1115,6 @@ void CPUQuantizePass::QuantizeFusionLSTM(Graph* graph) const { LogQuantizedOpsCounter("fusion_lstm", quantize_count); } -void CPUQuantizePass::QuantizeNearestInterp(Graph* graph) const { - GraphPatternDetector gpd; - auto pattern = gpd.mutable_pattern(); - patterns::NearestInterp nearest_interp_pattern{pattern, name_scope_}; - nearest_interp_pattern(); - - int quantize_nearest_interp_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "Quantize nearest_interp op"; - GET_IR_NODE_FROM_SUBGRAPH( - nearest_interp_op, nearest_interp_op, nearest_interp_pattern); - - // skip if should not be quantized - if (!platform::HasOpINT8DataType(nearest_interp_op->Op())) { - LogQuantizationDisabled(nearest_interp_op); - return; - } - GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, nearest_interp_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - nearest_interp_in, nearest_interp_in, nearest_interp_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - nearest_interp_out, nearest_interp_out, nearest_interp_pattern); - - // skip if prev op and next op is not quantized - if (!(IsOpDequantized(prev_op)) && !(IsOpQuantized(nearest_interp_out))) { - MarkAndLogCannotQuantizeOp(nearest_interp_op, - "No other quantizable operators nearby"); - return; - } - - if (!AreScalesPresentForNodes({nearest_interp_in, nearest_interp_out})) { - MarkAndLogCannotQuantizeOp(nearest_interp_op, - "No scale available for the operator"); - return; - } - - bool is_input_unsigned{false}; - auto input_scale = - GetScaleValueForNode(nearest_interp_in, &is_input_unsigned); - QuantizeInput(g, - nearest_interp_op, - nearest_interp_in, - "X", - input_scale, - is_input_unsigned); - - bool is_output_unsigned{false}; - auto output_scale = - GetScaleValueForNode(nearest_interp_out, &is_output_unsigned); - DequantizeOutput(g, - nearest_interp_op, - nearest_interp_out, - "Out", - output_scale, - is_output_unsigned); - - ++quantize_nearest_interp_count; - }; - - gpd(graph, handler); - AddStatis(quantize_nearest_interp_count); - LogQuantizedOpsCounter("nearest_interp", quantize_nearest_interp_count); -} - void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; PADDLE_ENFORCE_NOT_NULL( @@ -1293,18 +1131,19 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizePool(graph); QuantizeConcat(graph); QuantizePriorBox(graph); - QuantizeTranspose(graph); QuantizeFc(graph); - QuantizeReshape(graph); QuantizeMatmul(graph); + QuantizeImmutable(graph, "reshape2", "X"); + QuantizeImmutable(graph, "transpose2", "X"); + QuantizeImmutable(graph, "slice", "Input"); + QuantizeImmutable(graph, "nearest_interp", "X"); + QuantizeImmutable(graph, 
"nearest_interp_v2", "X"); QuantizeElementwise(graph, "elementwise_add"); QuantizeElementwise(graph, "elementwise_mul"); QuantizeElementwise(graph, "elementwise_sub"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); - QuantizeSlice(graph); - QuantizeNearestInterp(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index a880907402b3c..56909b7fe7fb5 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -54,16 +54,15 @@ class CPUQuantizePass : public FusePassBase { void QuantizePool(Graph* graph) const; void QuantizeConcat(Graph* graph) const; void QuantizePriorBox(Graph* graph) const; - void QuantizeTranspose(Graph* graph) const; - void QuantizeReshape(Graph* graph) const; void QuantizeMatmul(Graph* graph) const; void QuantizeElementwise(Graph* graph, - const std::string elementwise_type) const; + const std::string& elementwise_type) const; void QuantizeFusionGru(Graph* graph) const; void QuantizeMultiGru(Graph* graph) const; void QuantizeFusionLSTM(Graph* graph) const; - void QuantizeSlice(Graph* graph) const; - void QuantizeNearestInterp(Graph* graph) const; + void QuantizeImmutable(Graph* graph, + const std::string& immutable_type, + const std::string& input_name) const; void QuantizeInput(Graph* g, Node* op, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 4fa79f6a87ca8..322aa22c6ad14 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -550,55 +550,29 @@ void TestImmutableOpWithManyOutputs(const std::string tested_op) { SCALE * S8_MAX); } -TEST(CpuQuantizePass, reshape2) { TestImmutableOp("reshape2"); } +const std::vector immutables = { + "reshape2", "transpose2", "slice", "nearest_interp", "nearest_interp_v2"}; -TEST(CpuQuantizePass, reshape2BetweenNonQuantizedOp) { - TestImmutableOpBetweenNonQuantizedOp("reshape2"); -} - -TEST(CpuQuantizePass, reshape2WithManyOutputs) { - TestImmutableOpWithManyOutputs("reshape2"); -} - -TEST(CpuQuantizePass, transpose2) { TestImmutableOp("transpose2"); } - -TEST(CpuQuantizePass, transpose2BetweenNonQuantizedOp) { - TestImmutableOpBetweenNonQuantizedOp("transpose2"); -} - -TEST(CpuQuantizePass, transpose2WithManyOutputs) { - TestImmutableOpWithManyOutputs("transpose2"); -} - -TEST(CpuQuantizePass, slice) { TestImmutableOp("slice"); } - -TEST(CpuQuantizePass, sliceBetweenNonQuantizedOp) { - TestImmutableOpBetweenNonQuantizedOp("slice"); -} - -TEST(CpuQuantizePass, sliceWithManyOutputs) { - TestImmutableOpWithManyOutputs("slice"); -} +class TestImmutables : public testing::TestWithParam {}; -TEST(CpuQuantizePass, nearestInterp) { TestImmutableOp("nearest_interp"); } - -TEST(CpuQuantizePass, nearestInterpBetweenNonQuantizedOp) { - TestImmutableOpBetweenNonQuantizedOp("nearest_interp"); -} +TEST_P(TestImmutables, immutable_basic) { TestImmutableOp(GetParam()); } -TEST(CpuQuantizePass, nearestInterpWithManyOutputs) { - TestImmutableOpWithManyOutputs("nearest_interp"); +TEST_P(TestImmutables, immutable_between_non_quantized) { + TestImmutableOpBetweenNonQuantizedOp(GetParam()); } -TEST(CpuQuantizePass, nearestInterpV2) { TestImmutableOp("nearest_interp_v2"); } - -TEST(CpuQuantizePass, nearestInterpV2BetweenNonQuantizedOp) { - 
TestImmutableOpBetweenNonQuantizedOp("nearest_interp_v2"); +TEST_P(TestImmutables, immutable_many_outputs) { + TestImmutableOpWithManyOutputs(GetParam()); } -TEST(CpuQuantizePass, nearestInterpV2WithManyOutputs) { - TestImmutableOpWithManyOutputs("nearest_interp_v2"); -} +INSTANTIATE_TEST_CASE_P( + CpuQuantizePass, + TestImmutables, + testing::ValuesIn(immutables), + [](const ::testing::TestParamInfo& info) { + std::string name = info.param; + return name; + }); static const std::initializer_list variable_names_matmul = { "a", "b", "c", "d", "e", "f"}; @@ -735,7 +709,7 @@ TEST_P(TestElementwises, elementwise_unsigned_and_signed_input) { } INSTANTIATE_TEST_CASE_P( - Elementwises, + CpuQuantizePass, TestElementwises, testing::ValuesIn(elementwises), [](const ::testing::TestParamInfo& info) { From 01c4ad80db1b55b578a8dbda8905266e1bba54bb Mon Sep 17 00:00:00 2001 From: piotrekobi <48731682+piotrekobi@users.noreply.github.com> Date: Tue, 5 Jul 2022 13:24:15 +0200 Subject: [PATCH 064/250] Fix for ernie3.0 int8 (#43992) * Fix for ernie3.0 int8 * Move changes above comment --- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 1755b0f208207..79551b6d59a2c 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -170,6 +170,9 @@ class FCPrimitiveFactory { // In case of 2 dims, we set the only possible format, nc if (dim_num == 2) { out->set_format(MKLDNNMemoryFormat::nc); + out->set_mem_desc({phi::vectorize(out->dims()), + platform::MKLDNNGetDataType(), + out->format()}); // In case of 3 dims, we generate a format that is based on number // of output dims and the layout of input format (nchw or nhwc). 
} else if (dim_num == 3) { @@ -185,9 +188,6 @@ class FCPrimitiveFactory { } else { out->set_format(in_format); } - out->set_mem_desc({phi::vectorize(out->dims()), - platform::MKLDNNGetDataType(), - out->format()}); } void UpdateDataPointers(const ExecutionContext& ctx, From d90c39e937db998a562242ceeb8c0d0a726f7ef1 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Tue, 5 Jul 2022 19:29:12 +0800 Subject: [PATCH 065/250] [Windows CI] open all available ut in windows-inference pipeline (#43839) * open ut in windows-inference;test=document_fix;test=windows_ci_inference * test=document_fix;test=windows_ci_inference;disable failed tests temporarily in windows-inference --- tools/windows/run_unittests.sh | 153 +++++++++++++++++++++++++++++---- 1 file changed, 135 insertions(+), 18 deletions(-) diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index a1826220095b5..23a0b4d32828f 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -79,14 +79,126 @@ disable_wingpu11_test="^test_autograd_functional_dynamic$|\ # /*==========Fixed Disabled Windows CUDA11.x inference_api_test(PR-CI-Windows-Inference) unittests=============*/ -disable_win_inference_api_test="^trt_quant_int8_yolov3_r50_test$|\ +disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_trt_dynamic_shape_ernie$|\ ^test_trt_dynamic_shape_ernie_fp16_ser_deser$|\ ^lite_resnet50_test$|\ ^test_trt_dynamic_shape_transformer_prune$|\ ^lite_mul_model_test$|\ ^trt_split_converter_test$|\ -^paddle_infer_api_copy_tensor_tester$" +^paddle_infer_api_copy_tensor_tester$|\ +^test_tensorrt_engine_op$|\ +^test_tensorrt_engine$|\ +^test_trt_deformable_conv$|\ +^test_imperative_triple_grad$|\ +^test_full_name_usage$|\ +^test_trt_convert_unary$|\ +^test_eigh_op$|\ +^test_fc_op$|\ +^test_stack_op$|\ +^trt_split_converter_test$|\ +^paddle_infer_api_copy_tensor_tester$|\ +^test_var_base$|\ +^test_einsum_v2$|\ +^test_tensor_scalar_type_promotion_static$|\ +^test_matrix_power_op$|\ +^test_deformable_conv_v1_op$|\ +^zero_copy_tensor_test$|\ +^test_where_index$|\ +^test_custom_grad_input$|\ +^test_conv3d_transpose_op$|\ +^test_conv_elementwise_add_act_fuse_pass$|\ +^test_conv_eltwiseadd_bn_fuse_pass$|\ +^test_custom_relu_op_setup$|\ +^test_conv3d_transpose_part2_op$|\ +^test_deform_conv2d$|\ +^test_matmul_op$|\ +^test_basic_api_transformation$|\ +^test_deformable_conv_op$|\ +^test_variable$|\ +^test_conv_bias_mkldnn_fuse_pass_cc$|\ +^test_conv_batch_norm_mkldnn_fuse_pass$|\ +^test_compute_propagate_scales_mkldnn_pass$|\ +^test_cpu_quantize_pass$|\ +^test_cpu_quantize_squash_pass$|\ +^op_tester$|\ +^test_analyzer$|\ +^infer_io_utils_tester$|\ +^test_paddle_inference_api$|\ +^test_mkldnn_quantizer$|\ +^test_mkldnn_conv_hard_sigmoid_fuse_pass$|\ +^test_mkldnn_conv_hard_swish_fuse_pass$|\ +^test_conv_act_mkldnn_fuse_pass$|\ +^test_matmul_scale_fuse_pass$|\ +^test_addmm_op$|\ +^test_inverse_op$|\ +^test_set_value_op$|\ +^test_fused_multihead_matmul_op$|\ +^test_cudnn_bn_add_relu$|\ +^test_cond$|\ +^test_conv_bn_fuse_pass$|\ +^test_graph_khop_sampler$|\ +^test_gru_rnn_op$|\ +^test_masked_select_op$|\ +^test_ir_fc_fuse_pass$|\ +^test_fc_elementwise_layernorm_fuse_pass$|\ +^test_linalg_pinv_op$|\ +^test_math_op_patch_var_base$|\ +^test_slice$|\ +^test_conv_elementwise_add_fuse_pass$|\ +^test_executor_and_mul$|\ +^test_op_converter$|\ +^test_analyzer_int8_resnet50$|\ +^test_analyzer_int8_mobilenetv1$|\ +^test_trt_conv_pass$|\ +^test_analysis_predictor$|\ 
+^test_roll_op$|\ +^test_lcm$|\ +^test_elementwise_floordiv_op$|\ +^test_autograd_functional_dynamic$|\ +^test_corr$|\ +^test_trt_convert_deformable_conv$|\ +^test_conv_elementwise_add2_act_fuse_pass$|\ +^test_tensor_scalar_type_promotion_dynamic$|\ +^test_api_impl$|\ +^test_model$|\ +^test_py_reader_combination$|\ +^test_trt_convert_flatten$|\ +^test_py_reader_push_pop$|\ +^test_parallel_executor_feed_persistable_var$|\ +^test_parallel_executor_inference_feed_partial_data$|\ +^test_parallel_ssa_graph_inference_feed_partial_data$|\ +^test_reader_reset$|\ +^test_parallel_executor_seresnext_base_gpu$|\ +^test_py_reader_pin_memory$|\ +^test_multiprocess_dataloader_iterable_dataset_dynamic$|\ +^test_multiprocess_dataloader_iterable_dataset_static$|\ +^test_add_reader_dependency$|\ +^test_compat$|\ +^test_decoupled_py_reader$|\ +^test_generator_dataloader$|\ +^test_py_reader_using_executor$|\ +^test_imperative_static_runner_while$|\ +^test_dataloader_keep_order$|\ +^test_dataloader_unkeep_order$|\ +^test_sync_batch_norm_op$|\ +^test_fuse_bn_act_pass$|\ +^test_fuse_bn_add_act_pass$|\ +^test_decoupled_py_reader_data_check$|\ +^test_parallel_dygraph_sync_batch_norm$|\ +^test_dataloader_early_reset$|\ +^test_fleet_base_single$|\ +^test_sequence_pool$|\ +^test_simplify_with_basic_ops_pass_autoscan$|\ +^test_trt_activation_pass$|\ +^test_trt_convert_hard_swish$|\ +^test_trt_convert_leaky_relu$|\ +^test_trt_convert_multihead_matmul$|\ +^test_trt_convert_prelu$|\ +^test_trt_fc_fuse_quant_dequant_pass$|\ +^test_unsqueeze2_eltwise_fuse_pass$|\ +^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\ +^test_parallel_executor_seresnext_with_reduce_gpu$" # /*==========Fixed Disabled Windows CPU OPENBLAS((PR-CI-Windows-OPENBLAS)) unittests==============================*/ @@ -247,6 +359,11 @@ function run_unittest_gpu() { echo "********These unittests run $parallel_job job each time with 1 GPU**********" echo "************************************************************************" export CUDA_VISIBLE_DEVICES=0 + + if nvcc --version | grep 11.2; then + disable_wingpu_test=${disable_win_inference_test} + fi + tmpfile=$tmp_dir/$RANDOM (ctest -R "$test_case" -E "$disable_ut_quickly|$disable_wingpu_test|$disable_win_trt_test|$long_time_test" -LE "${nightly_label}" --output-on-failure -C Release -j $parallel_job | tee $tmpfile ) & wait; @@ -335,22 +452,22 @@ set +e export FLAGS_call_stack_level=2 -if nvcc --version | grep 11.2; then - echo "Only test added_ut and inference_api_test temporarily when running in CI-Windows-inference of CUDA 11.2." - export CUDA_VISIBLE_DEVICES=0 - tmpfile=$tmp_dir/$RANDOM - inference_api_test=^$(ls "paddle/fluid/inference/tests/api" | sed -n 's/\.exe$//pg' | awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' | sed 's/|\^$//g') - (ctest -R "$inference_api_test" -E "$disable_win_inference_api_test" --output-on-failure -C Release -j 2 | tee $tmpfile ) & - wait; - collect_failed_tests - set -e - rm -f $tmp_dir/* - if [[ "$failed_test_lists" != "" ]]; then - unittests_retry - show_ut_retry_result - fi - exit 0; -fi +# if nvcc --version | grep 11.2; then +# echo "Only test added_ut and inference_api_test temporarily when running in CI-Windows-inference of CUDA 11.2." 
+# export CUDA_VISIBLE_DEVICES=0 +# tmpfile=$tmp_dir/$RANDOM +# inference_api_test=^$(ls "paddle/fluid/inference/tests/api" | sed -n 's/\.exe$//pg' | awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' | sed 's/|\^$//g') +# (ctest -R "$inference_api_test" -E "$disable_win_inference_api_test" --output-on-failure -C Release -j 2 | tee $tmpfile ) & +# wait; +# collect_failed_tests +# set -e +# rm -f $tmp_dir/* +# if [[ "$failed_test_lists" != "" ]]; then +# unittests_retry +# show_ut_retry_result +# fi +# exit 0; +# fi if [ "${WITH_GPU:-OFF}" == "ON" ];then From 29b55009f78eb812fd13223975fd1a1aa7fafad0 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Tue, 5 Jul 2022 15:19:42 +0200 Subject: [PATCH 066/250] Persuading more efficient memory format to be preferred (#44078) * - blind shot fix * - workaround * - compilation fix * - Hack --- paddle/fluid/platform/mkldnn_helper.h | 16 +++++++++++----- paddle/fluid/platform/mkldnn_utils.h | 16 +++++++++++----- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 83fd353f54dd6..0e97a68edfc9d 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -233,15 +233,21 @@ inline dnnl::memory::format_tag GetMKLDNNFormat(dnnl::memory::desc mem_desc) { if (inner_nblks == 0) { if (strides[0] >= strides[1] && strides[1] >= strides[2] && strides[2] >= strides[3]) { - return dnnl::memory::format_tag::nchw; + return dnnl::memory::format_tag::abcd; + } else if (strides[2] >= strides[3] && strides[3] >= strides[1] && + strides[1] >= strides[0]) { + return dnnl::memory::format_tag::cdba; + } else if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[1]) { + return dnnl::memory::format_tag::acdb; + } else if (strides[0] >= strides[1] && strides[1] >= strides[3] && + strides[3] >= strides[2]) { + return dnnl::memory::format_tag::abdc; } else if (strides[2] >= strides[3] && strides[3] >= strides[1] && strides[1] >= strides[0]) { return dnnl::memory::format_tag::cdba; - } else if (strides[3] >= strides[2] && strides[2] >= strides[0] && - strides[0] >= strides[1]) { - return dnnl::memory::format_tag::dcab; } else { - return dnnl::memory::format_tag::nhwc; + return dnnl::memory::format_tag::dcab; } } else if (inner_nblks == 1) { if (inner_blks[0] == 16 && inner_idxs[0] == 1) { diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h index 12c48ed412428..38470d18f4623 100644 --- a/paddle/fluid/platform/mkldnn_utils.h +++ b/paddle/fluid/platform/mkldnn_utils.h @@ -51,15 +51,21 @@ inline dnnl::memory::format_tag GetMKLDNNFormat(dnnl::memory::desc mem_desc) { if (inner_nblks == 0) { if (strides[0] >= strides[1] && strides[1] >= strides[2] && strides[2] >= strides[3]) { - return dnnl::memory::format_tag::nchw; + return dnnl::memory::format_tag::abcd; + } else if (strides[2] >= strides[3] && strides[3] >= strides[1] && + strides[1] >= strides[0]) { + return dnnl::memory::format_tag::cdba; + } else if (strides[0] >= strides[2] && strides[2] >= strides[3] && + strides[3] >= strides[1]) { + return dnnl::memory::format_tag::acdb; + } else if (strides[0] >= strides[1] && strides[1] >= strides[3] && + strides[3] >= strides[2]) { + return dnnl::memory::format_tag::abdc; } else if (strides[2] >= strides[3] && strides[3] >= strides[1] && strides[1] >= strides[0]) { return dnnl::memory::format_tag::cdba; - } else if (strides[3] >= strides[2] && strides[2] >= strides[0] && - strides[0] >= strides[1]) { - 
return dnnl::memory::format_tag::dcab; } else { - return dnnl::memory::format_tag::nhwc; + return dnnl::memory::format_tag::dcab; } } else if (inner_nblks == 1) { if (inner_blks[0] == 16 && inner_idxs[0] == 1) { From a0dc361cbc9188852885ec72a4c2287c63818028 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Tue, 5 Jul 2022 22:53:06 +0800 Subject: [PATCH 067/250] Dataloader add custom device support (#44013) * Dataloader add custom device support * update test=document_fix --- paddle/fluid/memory/memcpy.cc | 22 ++ .../fluid/operators/reader/buffered_reader.cc | 76 +++++++ .../fluid/operators/reader/buffered_reader.h | 12 +- paddle/phi/backends/device_guard.h | 4 + paddle/phi/backends/device_manager.cc | 6 +- paddle/phi/backends/event.cc | 8 +- paddle/phi/backends/event.h | 1 + paddle/phi/backends/stream.cc | 7 +- python/paddle/fluid/tests/CMakeLists.txt | 1 + .../fluid/tests/custom_runtime/CMakeLists.txt | 3 + .../fluid/tests/custom_runtime/__init__.py | 13 ++ .../custom_runtime/custom_cpu_runtime.cc | 215 ++++++++++++++++++ .../tests/custom_runtime/custom_cpu_setup.py | 82 +++++++ .../test_custom_device_data_loader.py | 66 ++++++ 14 files changed, 509 insertions(+), 7 deletions(-) create mode 100644 python/paddle/fluid/tests/custom_runtime/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/custom_runtime/__init__.py create mode 100644 python/paddle/fluid/tests/custom_runtime/custom_cpu_runtime.cc create mode 100644 python/paddle/fluid/tests/custom_runtime/custom_cpu_setup.py create mode 100644 python/paddle/fluid/tests/custom_runtime/test_custom_device_data_loader.py diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index f09cbfc3bef16..05f46dd396023 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -1442,6 +1442,28 @@ void Copy(phi::Place dst_place, return Copy(place_dst, dst, place_src, src, num); } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT + dst_place.GetType() == phi::AllocationType::CUSTOM) { + platform::CustomPlace place_dst(dst_place.GetDeviceType(), + dst_place.GetDeviceId()); + platform::CPUPlace place_src; + return Copy(place_dst, dst, place_src, src, num, nullptr); + } else if (src_place.GetType() == phi::AllocationType::CUSTOM && + dst_place.GetType() == phi::AllocationType::CPU) { + platform::CustomPlace place_src(src_place.GetDeviceType(), + src_place.GetDeviceId()); + platform::CPUPlace place_dst; + return Copy(place_dst, dst, place_src, src, num, nullptr); + } else if (src_place.GetType() == phi::AllocationType::CUSTOM && + dst_place.GetType() == phi::AllocationType::CUSTOM) { + platform::CustomPlace place_src(src_place.GetDeviceType(), + src_place.GetDeviceId()); + platform::CustomPlace place_dst(dst_place.GetDeviceType(), + dst_place.GetDeviceId()); + return Copy(place_dst, dst, place_src, src, num, nullptr); + } +#endif } // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). 
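The memcpy.cc hunk above makes the place-generic Copy dispatch on phi::AllocationType, so CPU<->custom and custom<->custom copies are forwarded to the CustomPlace overloads with a null stream (i.e. a synchronous copy). A minimal sketch of a call that would take the new CPU -> CUSTOM branch; the device type name "custom_cpu", the helper function, and the exact phi::Place constructor arguments are illustrative assumptions, not part of the patch:

    // Illustrative only: copy a host buffer onto a plugin device registered as
    // "custom_cpu" through the AllocationType-based dispatch added above.
    #include "paddle/fluid/memory/memcpy.h"
    #include "paddle/phi/common/place.h"

    void CopyHostToCustom(void* dst_dev_ptr, const void* src_host_ptr, size_t bytes) {
      // Assumed constructor order: (AllocationType, device_id, device_type).
      phi::Place dst(phi::AllocationType::CUSTOM, 0, "custom_cpu");
      phi::Place src(phi::AllocationType::CPU);
      // Hits the CPU -> CUSTOM branch introduced in this commit, which forwards
      // to the Copy<CustomPlace, CPUPlace> overload with /*stream=*/nullptr.
      paddle::memory::Copy(dst, dst_dev_ptr, src, src_host_ptr, bytes);
    }
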
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index f3913a62b29d1..a36d51e42f5c8 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -19,6 +19,9 @@ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/backends/device_guard.h" +#include "paddle/phi/backends/device_manager.h" + namespace paddle { namespace operators { namespace reader { @@ -105,11 +108,30 @@ BufferedReader::BufferedReader( } #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place_)) { + auto stream = ((platform::CustomDeviceContext + *)(platform::DeviceContextPool::Instance().Get(place_))) + ->stream(); + custom_device_compute_stream_ = + std::make_shared(place_, stream); + + custom_device_events_.resize(buffer_size); + for (auto &event : custom_device_events_) { + event = std::make_shared(); + event->Init(place_); + } + custom_device_stream_ = std::make_shared(); + custom_device_stream_->Init(place_); + } +#endif + cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); mlu_buffer_.resize(buffer_size); xpu_buffer_.resize(buffer_size); + custom_device_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -410,6 +432,58 @@ void BufferedReader::ReadAsync(size_t i) { platform::XPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place_)) { + TensorVec &custom_device = custom_device_buffer_[i]; + if (custom_device.empty()) { + custom_device.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ(custom_device.size(), + cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on CustomDevice and CPU " + "devices are not matched. 
" + "The number on CustomDevice is %d, on CPU is %d", + custom_device.size(), + cpu.size())); + } + + std::vector custom_device_ptrs; + custom_device_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + custom_device[i].Resize(cpu[i].dims()); + custom_device[i].set_layout(cpu[i].layout()); + custom_device_ptrs.emplace_back( + custom_device[i].mutable_data(place_, cpu[i].type())); + } + + phi::DeviceManager::SetDevice(place_); + phi::DeviceManager::GetDeviceWithPlace(place_)->RecordEvent( + custom_device_events_[i].get(), custom_device_compute_stream_.get()); + phi::DeviceManager::GetDeviceWithPlace(place_)->StreamWaitEvent( + custom_device_stream_.get(), custom_device_events_[i].get()); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto custom_device_ptr = custom_device_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_custom_place(cpu_place))) { + memory::Copy(place_, custom_device_ptr, cpu_place, cpu_ptr, size); + custom_device_stream_->Synchronize(); + } else { + memory::Copy(place_, custom_device_ptr, cpu_place, cpu_ptr, size); + } + custom_device[i].set_lod(cpu[i].lod()); + } + custom_device_stream_->Synchronize(); + } +#endif return i; })); } @@ -449,6 +523,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(mlu_buffer_[i]); } else if (platform::is_xpu_place(place_)) { *out = std::move(xpu_buffer_[i]); + } else if (platform::is_custom_place(place_)) { + *out = std::move(custom_device_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 94c2fb12486bc..06aaf4c12057d 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -37,7 +37,10 @@ #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" #endif - +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/backends/event.h" +#include "paddle/phi/backends/stream.h" +#endif namespace paddle { namespace operators { namespace reader { @@ -82,6 +85,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector npu_buffer_; std::vector mlu_buffer_; std::vector xpu_buffer_; + std::vector custom_device_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -106,6 +110,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE + std::shared_ptr custom_device_compute_stream_; + std::shared_ptr custom_device_stream_; + std::vector> custom_device_events_; +#endif }; } // namespace reader diff --git a/paddle/phi/backends/device_guard.h b/paddle/phi/backends/device_guard.h index eb14236d251b3..668951f8a1c98 100644 --- a/paddle/phi/backends/device_guard.h +++ b/paddle/phi/backends/device_guard.h @@ -13,6 +13,8 @@ // limitations under the License. 
#pragma once +#ifdef PADDLE_WITH_CUSTOM_DEVICE + #include "paddle/phi/backends/device_manager.h" namespace phi { @@ -44,3 +46,5 @@ class DeviceGuard { }; } // namespace phi + +#endif diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 35339aed0f3e1..ffaf42a0cf4e6 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -394,8 +394,10 @@ DeviceManager& DeviceManager::Instance() { } void DeviceManager::Clear() { - Instance().device_map_.clear(); - Instance().device_impl_map_.clear(); + // TODO(wangran16): fix coredump when using npu plugin + + // Instance().device_map_.clear(); + // Instance().device_impl_map_.clear(); } std::vector ListAllLibraries(const std::string& library_dir) { diff --git a/paddle/phi/backends/event.cc b/paddle/phi/backends/event.cc index 43077d280f360..b594d919abc18 100644 --- a/paddle/phi/backends/event.cc +++ b/paddle/phi/backends/event.cc @@ -35,7 +35,11 @@ Event::~Event() { Destroy(); } bool Event::Init(const Place& place, Flag flags) { place_ = place; - DeviceGuard guard(place_); + device_ = phi::DeviceManager::GetDeviceWithPlace(place); + + // note(wangran16): bind device to the current thread. fix npu plugin null + // context bug. + phi::DeviceManager::SetDevice(place_); device_->CreateEvent(this, flags); VLOG(3) << "Init Event: " << event_ << ", place: " << place_ << ", flag:" << static_cast(flags); @@ -45,7 +49,7 @@ bool Event::Init(const Place& place, Flag flags) { void Event::Destroy() { if (own_data_) { - DeviceGuard guard(place_); + phi::DeviceManager::SetDevice(place_); device_->DestroyEvent(this); own_data_ = false; } diff --git a/paddle/phi/backends/event.h b/paddle/phi/backends/event.h index 0866adcf39afa..8de223528f8fd 100644 --- a/paddle/phi/backends/event.h +++ b/paddle/phi/backends/event.h @@ -36,6 +36,7 @@ class Event { Interprocess = 0x4, }; + Event() = default; // For compatible Event(const Place& place, event_t event); ~Event(); diff --git a/paddle/phi/backends/stream.cc b/paddle/phi/backends/stream.cc index f8b15bdbd9e63..bad57c5238ec8 100644 --- a/paddle/phi/backends/stream.cc +++ b/paddle/phi/backends/stream.cc @@ -40,7 +40,10 @@ bool Stream::Init(const Place& place, const Flag& flag) { place_ = place; device_ = phi::DeviceManager::GetDeviceWithPlace(place); - DeviceGuard guard(place_); + + // note(wangran16): bind device to the current thread. fix npu plugin null + // context bug. 
+ phi::DeviceManager::SetDevice(place_); device_->CreateStream(this, priority, flag); callback_manager_.reset(new CallbackManager(this)); @@ -80,7 +83,7 @@ void Stream::WaitCallback() const { callback_manager_->Wait(); } void Stream::Destroy() { if (own_data_) { - DeviceGuard guard(place_); + phi::DeviceManager::SetDevice(place_); device_->DestroyStream(this); own_data_ = false; } diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index 6acee6dc11c89..92e29202b28b8 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -12,5 +12,6 @@ add_subdirectory(unittests) add_subdirectory(book) add_subdirectory(custom_op) add_subdirectory(custom_kernel) +add_subdirectory(custom_runtime) set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt new file mode 100644 index 0000000000000..acd441c867787 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_CUSTOM_DEVICE) + py_test(test_custom_device_data_loader SRCS test_custom_device_data_loader.py) +endif() diff --git a/python/paddle/fluid/tests/custom_runtime/__init__.py b/python/paddle/fluid/tests/custom_runtime/__init__.py new file mode 100644 index 0000000000000..97043fd7ba688 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/custom_runtime/custom_cpu_runtime.cc b/python/paddle/fluid/tests/custom_runtime/custom_cpu_runtime.cc new file mode 100644 index 0000000000000..18762625c0fe2 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/custom_cpu_runtime.cc @@ -0,0 +1,215 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "paddle/phi/backends/device_ext.h" + +#define MEMORY_FRACTION 0.5f + +C_Status Init() { return C_SUCCESS; } + +C_Status InitDevice(const C_Device device) { return C_SUCCESS; } + +C_Status SetDevice(const C_Device device) { return C_SUCCESS; } + +C_Status GetDevice(const C_Device device) { + device->id = 0; + return C_SUCCESS; +} + +C_Status DestroyDevice(const C_Device device) { return C_SUCCESS; } + +C_Status Finalize() { return C_SUCCESS; } + +C_Status GetDevicesCount(size_t *count) { + *count = 1; + return C_SUCCESS; +} + +C_Status GetDevicesList(size_t *devices) { + devices[0] = 0; + return C_SUCCESS; +} + +C_Status MemCpy(const C_Device device, + void *dst, + const void *src, + size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status AsyncMemCpy(const C_Device device, + C_Stream stream, + void *dst, + const void *src, + size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status MemCpyP2P(const C_Device dst_device, + const C_Device src_device, + void *dst, + const void *src, + size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status AsyncMemCpyP2P(const C_Device dst_device, + const C_Device src_device, + C_Stream stream, + void *dst, + const void *src, + size_t size) { + memcpy(dst, src, size); + return C_SUCCESS; +} + +C_Status Allocate(const C_Device device, void **ptr, size_t size) { + auto data = malloc(size); + if (data) { + *ptr = data; + return C_SUCCESS; + } else { + *ptr = nullptr; + } + return C_FAILED; +} + +C_Status Deallocate(const C_Device device, void *ptr, size_t size) { + free(ptr); + return C_SUCCESS; +} + +C_Status CreateStream(const C_Device device, C_Stream *stream) { + stream = nullptr; + return C_SUCCESS; +} + +C_Status DestroyStream(const C_Device device, C_Stream stream) { + return C_SUCCESS; +} + +C_Status CreateEvent(const C_Device device, C_Event *event) { + return C_SUCCESS; +} + +C_Status RecordEvent(const C_Device device, C_Stream stream, C_Event event) { + return C_SUCCESS; +} + +C_Status DestroyEvent(const C_Device device, C_Event event) { + return C_SUCCESS; +} + +C_Status SyncDevice(const C_Device device) { return C_SUCCESS; } + +C_Status SyncStream(const C_Device device, C_Stream stream) { + return C_SUCCESS; +} + +C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; } + +C_Status StreamWaitEvent(const C_Device device, + C_Stream stream, + C_Event event) { + return C_SUCCESS; +} + +C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; } + +C_Status DeviceMemStats(const C_Device device, + size_t *total_memory, + size_t *free_memory) { + float memusage; + FILE *fp; + char buffer[1024]; + size_t byte_read; + char *pos; + + fp = fopen("/proc/meminfo", "r"); + byte_read = fread(buffer, 1, sizeof(buffer), fp); + fclose(fp); + buffer[byte_read] = '\0'; + pos = strstr(buffer, "MemTotal:"); + sscanf(pos, "MemTotal: %lu kB", total_memory); + pos = strstr(pos, "MemFree:"); + sscanf(pos, "MemFree: %lu kB", free_memory); + *total_memory = *total_memory * 1024; + *free_memory = *free_memory * 1024; + *free_memory = *free_memory * MEMORY_FRACTION; + + return C_SUCCESS; +} + +C_Status DeviceMinChunkSize(const C_Device device, size_t *size) { + *size = 512; + return C_SUCCESS; +} + +void InitPlugin(CustomRuntimeParams *params) { + PADDLE_CUSTOM_RUNTIME_CHECK_VERSION(params); + params->device_type = "custom_cpu"; + params->sub_device_type = "v0.1"; + + memset(reinterpret_cast(params->interface), + 0, + sizeof(C_DeviceInterface)); 
+ + params->interface->initialize = Init; + params->interface->finalize = Finalize; + + params->interface->init_device = InitDevice; + params->interface->set_device = SetDevice; + params->interface->get_device = GetDevice; + params->interface->deinit_device = DestroyDevice; + + params->interface->create_stream = CreateStream; + params->interface->destroy_stream = DestroyStream; + + params->interface->create_event = CreateEvent; + params->interface->destroy_event = DestroyEvent; + params->interface->record_event = RecordEvent; + + params->interface->synchronize_device = SyncDevice; + params->interface->synchronize_stream = SyncStream; + params->interface->synchronize_event = SyncEvent; + params->interface->stream_wait_event = StreamWaitEvent; + + params->interface->memory_copy_h2d = MemCpy; + params->interface->memory_copy_d2d = MemCpy; + params->interface->memory_copy_d2h = MemCpy; + params->interface->memory_copy_p2p = MemCpyP2P; + params->interface->async_memory_copy_h2d = AsyncMemCpy; + params->interface->async_memory_copy_d2d = AsyncMemCpy; + params->interface->async_memory_copy_d2h = AsyncMemCpy; + params->interface->async_memory_copy_p2p = AsyncMemCpyP2P; + params->interface->device_memory_allocate = Allocate; + params->interface->host_memory_allocate = Allocate; + params->interface->unified_memory_allocate = Allocate; + params->interface->device_memory_deallocate = Deallocate; + params->interface->host_memory_deallocate = Deallocate; + params->interface->unified_memory_deallocate = Deallocate; + + params->interface->get_device_count = GetDevicesCount; + params->interface->get_device_list = GetDevicesList; + params->interface->device_memory_stats = DeviceMemStats; + params->interface->device_min_chunk_size = DeviceMinChunkSize; +} diff --git a/python/paddle/fluid/tests/custom_runtime/custom_cpu_setup.py b/python/paddle/fluid/tests/custom_runtime/custom_cpu_setup.py new file mode 100644 index 0000000000000..82accb2ad00df --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/custom_cpu_setup.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import site +from paddle.fluid import core +from distutils.sysconfig import get_python_lib +from distutils.core import setup, Extension +from setuptools.command.build_ext import build_ext + + +# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes +# Avoid a gcc warning below: +# cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid +# for C/ObjC but not for C++ +class BuildExt(build_ext): + + def build_extensions(self): + if '-Wstrict-prototypes' in self.compiler.compiler_so: + self.compiler.compiler_so.remove('-Wstrict-prototypes') + super(BuildExt, self).build_extensions() + + +# cc flags +paddle_extra_compile_args = [ + '-std=c++14', + '-shared', + '-fPIC', + '-Wno-parentheses', + '-DPADDLE_WITH_CUSTOM_KERNEL', + '-DPADDLE_WITH_CUSTOM_DEVICE', +] +if core.is_compiled_with_npu(): + paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] + +# include path +site_packages_path = site.getsitepackages() +include_dirs = list( + map(lambda path: os.path.join(path, 'paddle', 'include'), + site_packages_path)) + +# include path third_party +compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], + 'build/third_party') +include_dirs += [ + os.path.join(compile_third_party_path, 'boost/src/extern_boost'), # boost + os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags + os.path.join(compile_third_party_path, 'install/glog/include'), # glog +] + +# libs path +library_dirs = list( + map(lambda path: os.path.join(path, 'paddle', 'fluid'), site_packages_path)) + +# libs +libs = [':core_avx.so'] +if not core.has_avx_core and core.has_noavx_core: + libs = [':core_noavx.so'] + +custom_cpu_plugin_so = Extension('custom_cpu_runtime', + sources=['custom_cpu_runtime.cc'], + include_dirs=include_dirs, + library_dirs=library_dirs, + libraries=libs, + extra_compile_args=paddle_extra_compile_args) + +setup(name='custom_kernel_dot', + version='1.0', + description='custom kernel fot compiling', + cmdclass={'build_ext': BuildExt}, + ext_modules=[custom_cpu_plugin_so]) diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_device_data_loader.py b/python/paddle/fluid/tests/custom_runtime/test_custom_device_data_loader.py new file mode 100644 index 0000000000000..775c3f487d596 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_device_data_loader.py @@ -0,0 +1,66 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys +import site +import unittest +import numpy as np + + +class TestCustomDeviceDataLoader(unittest.TestCase): + + def setUp(self): + # compile so and set to current path + cur_dir = os.path.dirname(os.path.abspath(__file__)) + + # --inplace to place output so file to current dir + cmd = 'cd {} && {} custom_cpu_setup.py build_ext --inplace'.format( + cur_dir, sys.executable) + os.system(cmd) + + # set environment for loading and registering compiled custom kernels + # only valid in current process + os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir + + def test_custom_device_dataloader(self): + import paddle + + paddle.set_device('custom_cpu') + dataset = paddle.vision.datasets.MNIST( + mode='test', + transform=paddle.vision.transforms.Compose([ + paddle.vision.transforms.CenterCrop(20), + paddle.vision.transforms.RandomResizedCrop(14), + paddle.vision.transforms.Normalize(), + paddle.vision.transforms.ToTensor() + ])) + loader = paddle.io.DataLoader(dataset, + batch_size=32, + num_workers=1, + shuffle=True) + for image, label in loader: + self.assertTrue(image.place.is_custom_place()) + self.assertTrue(label.place.is_custom_place()) + break + + def tearDown(self): + del os.environ['CUSTOM_DEVICE_ROOT'] + + +if __name__ == '__main__': + if os.name == 'nt' or sys.platform.startswith('darwin'): + # only support Linux now + exit() + unittest.main() From 953024ff4fb81543825df952670c16109fb2bf90 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Wed, 6 Jul 2022 09:42:59 +0800 Subject: [PATCH 068/250] fix stack_op_plugin (#44045) --- paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index 4ef160d2e04b8..e77f12769c0f3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -152,8 +152,8 @@ __global__ void StackKernel(const T* const* input, T* output, int num_stack, int base_unit) { - int stack_id = blockIdx.x; - int lead_id = blockIdx.y; + int stack_id = blockIdx.y; + int lead_id = blockIdx.x; for (int i = threadIdx.x; i < base_unit; i += blockDim.x) { output[lead_id * num_stack * base_unit + stack_id * base_unit + i] = @@ -201,7 +201,8 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, stream); const int num_stacks = out_dims.d[axis_]; - dim3 num_blocks(num_stacks, lead_unit); + // lead_unit may be very large, so make it be blockIdx.x + dim3 num_blocks(lead_unit, num_stacks); const int num_threads = 256; auto infer_type = input_desc[0].type; From 07b68eb34b41c21df900f05c9003f4224b52f441 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 6 Jul 2022 10:00:44 +0800 Subject: [PATCH 069/250] [gpups]fix sparse config work (#44090) --- .../framework/fleet/heter_ps/heter_comm_inl.h | 18 +++++++--- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 33 ++++++++----------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index ace533cb0c745..a7333cd01c6ec 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -426,16 +426,26 @@ int HeterComm::get_index_by_devid(int devid) { template void 
HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { - for (auto& table : tables_) { - table->set_sparse_sgd(optimizer_config); + for (int i = 0; i < resource_->total_device(); ++i) { + AnyDeviceGuard guard(resource_->dev_id(i)); + if (!multi_mf_dim_) { + tables_[i]->set_sparse_sgd(optimizer_config); + } else { + ptr_tables_[i]->set_sparse_sgd(optimizer_config); + } } } template void HeterComm::set_embedx_sgd( const OptimizerConfig& optimizer_config) { - for (auto& table : tables_) { - table->set_embedx_sgd(optimizer_config); + for (int i = 0; i < resource_->total_device(); ++i) { + AnyDeviceGuard guard(resource_->dev_id(i)); + if (!multi_mf_dim_) { + tables_[i]->set_embedx_sgd(optimizer_config); + } else { + ptr_tables_[i]->set_embedx_sgd(optimizer_config); + } } } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index fae30a45d2e5b..65f86acce9151 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -323,26 +323,19 @@ class PSGPUWrapper { float mf_max_bound = (config.find("mf_max_bound") == config.end()) ? 1.0 : config["mf_max_bound"]; - for (size_t i = 0; i < heter_devices_.size(); i++) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(heter_devices_[i])); -#elif defined(PADDLE_WITH_XPU_KP) - PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(heter_devices_[i])); -#endif - this->SetSparseSGD(nonclk_coeff, - clk_coeff, - min_bound, - max_bound, - learning_rate, - initial_g2sum, - initial_range); - this->SetEmbedxSGD(mf_create_thresholds, - mf_learning_rate, - mf_initial_g2sum, - mf_initial_range, - mf_min_bound, - mf_max_bound); - } + this->SetSparseSGD(nonclk_coeff, + clk_coeff, + min_bound, + max_bound, + learning_rate, + initial_g2sum, + initial_range); + this->SetEmbedxSGD(mf_create_thresholds, + mf_learning_rate, + mf_initial_g2sum, + mf_initial_range, + mf_min_bound, + mf_max_bound); } void SetDate(int year, int month, int day) { From 24d07b73ded95d6f7ef5495ec3c40d4ad1181ffd Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 6 Jul 2022 10:12:04 +0800 Subject: [PATCH 070/250] generate map of extra attrs for ops (#44106) --- .gitignore | 1 + paddle/phi/api/lib/CMakeLists.txt | 15 +++ paddle/phi/api/yaml/api_compat.yaml | 9 ++ paddle/phi/api/yaml/generator/generate_op.py | 2 + .../api/yaml/generator/ops_extra_info_gen.py | 110 ++++++++++++++++++ 5 files changed, 137 insertions(+) create mode 100644 paddle/phi/api/yaml/generator/ops_extra_info_gen.py diff --git a/.gitignore b/.gitignore index 25ecd77e25de9..2c486ec96f106 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ paddle/fluid/API_PR.spec paddle/fluid/eager/api/generated/* paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec +paddle/fluid/operators/ops_extra_info.h paddle/phi/api/backward/backward_api.h paddle/phi/api/backward/sparse_bw_api.h paddle/phi/api/include/api.h diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 2a1a6b4e78bd5..f50323cef216c 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -94,6 +94,14 @@ set(wrapped_infermeta_header_file set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) +# op extra info file +set(ops_extra_info_gen_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/ops_extra_info_gen.py) +set(api_compat_yaml_file + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/api_compat.yaml) 
+set(ops_extra_info_file + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/ops_extra_info.h) + if(NOT PYTHONINTERP_FOUND) find_package(PythonInterp REQUIRED) endif() @@ -211,6 +219,13 @@ else() message("remove ${generated_argument_mapping_path}") endif() +# generate ops extra info +execute_process( + COMMAND + ${PYTHON_EXECUTABLE} ${ops_extra_info_gen_file} --api_compat_yaml_path + ${api_compat_yaml_file} --ops_extra_info_path ${ops_extra_info_file}) +message("generate ${ops_extra_info_file}") + # generate forward api add_custom_command( OUTPUT ${api_header_file} ${api_source_file} diff --git a/paddle/phi/api/yaml/api_compat.yaml b/paddle/phi/api/yaml/api_compat.yaml index 17f1d545057c0..987876d703928 100644 --- a/paddle/phi/api/yaml/api_compat.yaml +++ b/paddle/phi/api/yaml/api_compat.yaml @@ -23,3 +23,12 @@ x : Input outputs : out : Out + +- api : conv2d + extra : + attrs : [bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", bool fuse_alpha = false, bool fuse_beta = false, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = 512, bool exhaustive_search = false] diff --git a/paddle/phi/api/yaml/generator/generate_op.py b/paddle/phi/api/yaml/generator/generate_op.py index 627051365c3f7..e70042fb9d033 100644 --- a/paddle/phi/api/yaml/generator/generate_op.py +++ b/paddle/phi/api/yaml/generator/generate_op.py @@ -76,6 +76,8 @@ def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, api_args_map = yaml.safe_load(f) # replace args name for OpMaker for api_args in api_args_map: + if api_args['api'] not in forward_api_dict: + continue forward_api_item = forward_api_dict[api_args['api']] has_backward = True if forward_api_item['backward'] else False if has_backward: diff --git a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py new file mode 100644 index 0000000000000..ef5afbf595b96 --- /dev/null +++ b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import yaml +import re +import argparse + + +def map_code_template(attrs_str): + return f""" +#include "paddle/fluid/framework/attribute.h" + +namespace paddle {{ +const static std::unordered_map extra_attrs_map = {{ +{attrs_str} +}}; + +}} // namespace paddle + +""" + + +ATTR_TYPE_STRING_MAP = { + 'bool': 'bool', + 'int': 'int', + 'int64_t': 'int64_t', + 'float': 'float', + 'double': 'double', + 'str': 'std::string', + 'int[]': 'std::vector', + 'int64_t[]': 'std::vector', + 'float[]': 'std::vector', + 'double[]': 'std::vector', + 'str[]': 'std::vector' +} + + +def parse_attr(attr_str): + result = re.search( + r"(?P[a-z[\]]+)\s+(?P[a-zA-Z0-9_]+)\s*=\s*(?P\S+)", + attr_str) + return ATTR_TYPE_STRING_MAP[result.group('attr_type')], result.group( + 'name'), result.group('default_val') + + +def generate_extra_info(api_compat_yaml_path, ops_extra_info_path): + compat_apis = [] + with open(api_compat_yaml_path, 'rt') as f: + compat_apis = yaml.safe_load(f) + + extra_map_str_list = [] + + for api_compat_args in compat_apis: + if 'extra' in api_compat_args: + extra_args_map = api_compat_args['extra'] + # TODO(chenweihang): add inputs and outputs + if 'attrs' in extra_args_map: + attr_map_list = [] + for attr in extra_args_map['attrs']: + attr_type, attr_name, default_val = parse_attr(attr) + if attr_type.startswith("std::vector"): + attr_map_list.append( + f"{{\"{attr_name}\", {attr_type}{default_val}}}") + else: + attr_map_list.append( + f"{{\"{attr_name}\", {attr_type}{{{default_val}}}}}" + ) + api_extra_attr_map = ", ".join(attr_map_list) + extra_map_str_list.append( + f"{{\"{api_compat_args['api']}\", {{ {api_extra_attr_map} }}}}" + ) + + ops_extra_info_file = open(ops_extra_info_path, 'w') + ops_extra_info_file.write(map_code_template(",\n".join(extra_map_str_list))) + ops_extra_info_file.close() + + +def main(): + parser = argparse.ArgumentParser( + description='Generate PaddlePaddle Extra Param Info for Op') + parser.add_argument('--api_compat_yaml_path', + help='path to api compat yaml file', + default='paddle/phi/api/yaml/api_compat.yaml') + + parser.add_argument('--ops_extra_info_path', + help='output of generated extra_prama_info code file', + default='paddle/fluid/operators/ops_extra_info.h') + + options = parser.parse_args() + + api_compat_yaml_path = options.api_compat_yaml_path + ops_extra_info_path = options.ops_extra_info_path + + generate_extra_info(api_compat_yaml_path, ops_extra_info_path) + + +if __name__ == '__main__': + main() From 4c269ccb5965c35c3285a1a79db042d6aabc6182 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Wed, 6 Jul 2022 11:29:45 +0800 Subject: [PATCH 071/250] [Paddle Inference] Add conv_elementwise_act. 
(#43871) * conv_fusion --- .../ir/conv_elementwise_add2_act_fuse_pass.cc | 18 ++++++++++++++++++ .../ir/conv_elementwise_add_act_fuse_pass.cc | 18 ++++++++++++++++++ .../framework/ir/graph_pattern_detector.cc | 3 ++- paddle/fluid/operators/fused/conv_fusion_op.cu | 8 +++++--- 4 files changed, 43 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index ff86bdb8fa86f..6d9611ebd1393 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -105,6 +105,22 @@ ConvElementwiseAdd2ActFusePass::ConvElementwiseAdd2ActFusePass() { .AddOutput("Out") .IsTensor() .End(); + + AddOpCompat(OpCompat("sigmoid")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("tanh")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); } void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { @@ -188,4 +204,6 @@ REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass) .LE("conv2d", 1) .LE("elementwise_add", 1) .EQ("relu", 0) + .EQ("sigmoid", 0) + .EQ("tanh", 0) .EQ("identity", 0)); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index f67e83bc10171..47e2c5e380bcb 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -102,6 +102,22 @@ ConvElementwiseAddActFusePass::ConvElementwiseAddActFusePass() { .AddOutput("Out") .IsTensor() .End(); + + AddOpCompat(OpCompat("sigmoid")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("tanh")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); } void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { @@ -170,4 +186,6 @@ REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass) .LE("conv2d", 1) .LE("elementwise_add", 1) .EQ("relu", 0) + .EQ("sigmoid", 0) + .EQ("tanh", 0) .EQ("identity", 0)); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 154df498e7d13..f0949cb9dfbd2 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2324,7 +2324,8 @@ PDNode *patterns::PriorBox::operator()() { return boxes_var; } -std::unordered_set conv_act_set({"identity", "relu"}); +std::unordered_set conv_act_set( + {"identity", "relu", "sigmoid", "tanh"}); PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 5e96ca140274d..2ee63c9364221 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -544,9 +544,11 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { namespace ops = paddle::operators; #if CUDNN_VERSION >= 7100 -REGISTER_OP_CUDA_KERNEL(conv2d_fusion, - ops::CUDNNConvFusionOpKernel, - ops::CUDNNConvFusionOpKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_fusion, + ops::CUDNNConvFusionOpKernel, + ops::CUDNNConvFusionOpKernel, + ops::CUDNNConvFusionOpKernel); #endif #ifdef PADDLE_WITH_HIP REGISTER_OP_CUDA_KERNEL(conv2d_fusion, 
ops::CUDNNConvFusionOpKernel); From 502062da65fa940ff483870b02faeeace3e224a4 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Wed, 6 Jul 2022 11:37:38 +0800 Subject: [PATCH 072/250] minor fix VLOG for xpu. test=kunlun. (#44099) --- paddle/fluid/memory/allocation/naive_best_fit_allocator.cc | 2 +- paddle/phi/backends/xpu/xpu_context.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 57c5941d5227d..4553c80e74c59 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -180,7 +180,7 @@ void Free(const platform::XPUPlace &place, void *p, size_t size) { #ifdef PADDLE_WITH_XPU - VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "Free " << size << " bytes on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); platform::XPUDeviceGuard gurad(place.device); diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index dbff88c0a2709..e73aa30c8d85b 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -45,8 +45,8 @@ struct XPUContext::Impl { } if (l3ptrs[place_.GetDeviceId()] != nullptr) { context_->_l3_mgr.set(l3ptrs[place_.GetDeviceId()], l3_size); - VLOG(3) << "xpu place " << place_.GetDeviceId() << " set l3 size " - << l3_size; + VLOG(3) << "xpu place " << static_cast(place_.GetDeviceId()) + << " set l3 size " << l3_size; } break; } From 54a9daf2a0be7dd2a973c3a319a2dcfd0cf3baa2 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Wed, 6 Jul 2022 14:54:23 +0800 Subject: [PATCH 073/250] [Eager] Menual fused feed forward (#43994) * fused_gate_attention manual code in eager * Menual fused_feedforward in eager * fix test case --- .../manual/fluid_manual/dygraph_forward_api.h | 25 ++ .../fluid_manual/forwards/CMakeLists.txt | 9 +- .../forwards/fused_feedforward_fwd_func.cc | 403 ++++++++++++++++++ .../manual/fluid_manual/nodes/CMakeLists.txt | 7 +- .../nodes/fused_feedforward_node.cc | 208 +++++++++ .../api/manual/fluid_manual/nodes/nodes.h | 155 +++++++ .../auto_code_generator/eager_generator.cc | 2 +- .../unittests/test_fused_feedforward_op.py | 4 +- 8 files changed, 807 insertions(+), 6 deletions(-) create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc diff --git a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h index 3715544b923aa..397e549e61473 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h @@ -42,3 +42,28 @@ fused_gate_attention_dygraph_function( const paddle::experimental::Tensor& OutLinearWeight, const paddle::experimental::Tensor& OutLinearBias, const paddle::framework::AttributeMap& attr_map); + +std::tuple +fused_feedforward_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Dropout1Seed, + const paddle::experimental::Tensor& Dropout2Seed, + const paddle::experimental::Tensor& Linear1Weight, + const paddle::experimental::Tensor& Linear1Bias, + const paddle::experimental::Tensor& Linear2Weight, + 
const paddle::experimental::Tensor& Linear2Bias, + const paddle::experimental::Tensor& Ln1Scale, + const paddle::experimental::Tensor& Ln1Bias, + const paddle::experimental::Tensor& Ln2Scale, + const paddle::experimental::Tensor& Ln2Bias, + const paddle::framework::AttributeMap& attr_map); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt index 2a7d72eb7cabd..305df1c92c6e1 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt @@ -5,6 +5,13 @@ cc_library( add_dependencies(fused_gate_attention_fwd_func eager_codegen) +cc_library( + fused_feedforward_fwd_func + SRCS fused_feedforward_fwd_func.cc + DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + +add_dependencies(fused_feedforward_fwd_func eager_codegen) + set(fluid_manual_functions - fused_gate_attention_fwd_func + fused_gate_attention_fwd_func fused_feedforward_fwd_func PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc new file mode 100644 index 0000000000000..e246649314b52 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc @@ -0,0 +1,403 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/eager/amp_auto_cast.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" + +std::tuple +fused_feedforward_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Dropout1Seed, + const paddle::experimental::Tensor& Dropout2Seed, + const paddle::experimental::Tensor& Linear1Weight, + const paddle::experimental::Tensor& Linear1Bias, + const paddle::experimental::Tensor& Linear2Weight, + const paddle::experimental::Tensor& Linear2Bias, + const paddle::experimental::Tensor& Ln1Scale, + const paddle::experimental::Tensor& Ln1Bias, + const paddle::experimental::Tensor& Ln2Scale, + const paddle::experimental::Tensor& Ln2Bias, + const paddle::framework::AttributeMap& attr_map) { + paddle::platform::RecordEvent dygraph_entrance_record_event( + "fused_feedforward dygraph", + paddle::platform::TracerEventType::Operator, + 1); + VLOG(3) << "Running Eager Forward Op: fused_feedforward"; + // Dygraph Forward Pass + + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {{X}, {Linear1Weight}, {Linear2Weight}}; + if (Dropout1Seed.initialized()) + amp_tensors_vector.push_back({Dropout1Seed}); + if (Dropout2Seed.initialized()) + amp_tensors_vector.push_back({Dropout2Seed}); + if (Linear1Bias.initialized()) amp_tensors_vector.push_back({Linear1Bias}); + if (Linear2Bias.initialized()) amp_tensors_vector.push_back({Linear2Bias}); + if (Ln1Scale.initialized()) amp_tensors_vector.push_back({Ln1Scale}); + if (Ln1Bias.initialized()) amp_tensors_vector.push_back({Ln1Bias}); + if (Ln2Scale.initialized()) amp_tensors_vector.push_back({Ln2Scale}); + if (Ln2Bias.initialized()) amp_tensors_vector.push_back({Ln2Bias}); + + auto amp_dst_dtype = + egr::GetAmpDestDtype("fused_feedforward", amp_tensors_vector); + + auto NEW_X = egr::AmpAutoCast("X", X, amp_dst_dtype, "fused_feedforward"); + auto NEW_Linear1Weight = egr::AmpAutoCast( + "Linear1Weight", Linear1Weight, amp_dst_dtype, "fused_feedforward"); + auto NEW_Linear2Weight = egr::AmpAutoCast( + "Linear2Weight", Linear2Weight, amp_dst_dtype, "fused_feedforward"); + auto NEW_Dropout1Seed = + ((Dropout1Seed.initialized()) ? egr::AmpAutoCast("Dropout1Seed", + Dropout1Seed, + amp_dst_dtype, + "fused_feedforward") + : Dropout1Seed); + auto NEW_Dropout2Seed = + ((Dropout2Seed.initialized()) ? egr::AmpAutoCast("Dropout2Seed", + Dropout2Seed, + amp_dst_dtype, + "fused_feedforward") + : Dropout2Seed); + auto NEW_Linear1Bias = + ((Linear1Bias.initialized()) ? egr::AmpAutoCast("Linear1Bias", + Linear1Bias, + amp_dst_dtype, + "fused_feedforward") + : Linear1Bias); + auto NEW_Linear2Bias = + ((Linear2Bias.initialized()) ? egr::AmpAutoCast("Linear2Bias", + Linear2Bias, + amp_dst_dtype, + "fused_feedforward") + : Linear2Bias); + auto NEW_Ln1Scale = + ((Ln1Scale.initialized()) + ? egr::AmpAutoCast( + "Ln1Scale", Ln1Scale, amp_dst_dtype, "fused_feedforward") + : Ln1Scale); + auto NEW_Ln1Bias = + ((Ln1Bias.initialized()) + ? egr::AmpAutoCast( + "Ln1Bias", Ln1Bias, amp_dst_dtype, "fused_feedforward") + : Ln1Bias); + auto NEW_Ln2Scale = + ((Ln2Scale.initialized()) + ? 
egr::AmpAutoCast( + "Ln2Scale", Ln2Scale, amp_dst_dtype, "fused_feedforward") + : Ln2Scale); + auto NEW_Ln2Bias = + ((Ln2Bias.initialized()) + ? egr::AmpAutoCast( + "Ln2Bias", Ln2Bias, amp_dst_dtype, "fused_feedforward") + : Ln2Bias); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return fused_feedforward_dygraph_function(NEW_X, + NEW_Dropout1Seed, + NEW_Dropout2Seed, + NEW_Linear1Weight, + NEW_Linear1Bias, + NEW_Linear2Weight, + NEW_Linear2Bias, + NEW_Ln1Scale, + NEW_Ln1Bias, + NEW_Ln2Scale, + NEW_Ln2Bias, + attr_map); + } + } + + std::map>> ins = + {{"X", egr::EagerUtils::TrySyncToVars(X)}, + {"Linear1Weight", egr::EagerUtils::TrySyncToVars(Linear1Weight)}, + {"Linear2Weight", egr::EagerUtils::TrySyncToVars(Linear2Weight)}}; + if (Dropout1Seed.initialized()) + ins["Dropout1Seed"] = egr::EagerUtils::TrySyncToVars(Dropout1Seed); + if (Dropout2Seed.initialized()) + ins["Dropout2Seed"] = egr::EagerUtils::TrySyncToVars(Dropout2Seed); + if (Linear1Bias.initialized()) + ins["Linear1Bias"] = egr::EagerUtils::TrySyncToVars(Linear1Bias); + if (Linear2Bias.initialized()) + ins["Linear2Bias"] = egr::EagerUtils::TrySyncToVars(Linear2Bias); + if (Ln1Scale.initialized()) + ins["Ln1Scale"] = egr::EagerUtils::TrySyncToVars(Ln1Scale); + if (Ln1Bias.initialized()) + ins["Ln1Bias"] = egr::EagerUtils::TrySyncToVars(Ln1Bias); + if (Ln2Scale.initialized()) + ins["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale); + if (Ln2Bias.initialized()) + ins["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias); + + std::map>> outs = + {{"Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Dropout1Mask", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Dropout2Mask", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln1Mean", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln1Variance", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln2Mean", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln2Variance", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Linear1Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln1Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Dropout1Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Dropout2Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}}; + + // Prepare Autograd Meta + egr::AutogradMeta* p_autograd_X = egr::EagerUtils::nullable_autograd_meta(X); + egr::AutogradMeta* p_autograd_Dropout1Seed = + egr::EagerUtils::nullable_autograd_meta(Dropout1Seed); + egr::AutogradMeta* p_autograd_Dropout2Seed = + egr::EagerUtils::nullable_autograd_meta(Dropout2Seed); + egr::AutogradMeta* p_autograd_Linear1Weight = + egr::EagerUtils::nullable_autograd_meta(Linear1Weight); + egr::AutogradMeta* p_autograd_Linear1Bias = + egr::EagerUtils::nullable_autograd_meta(Linear1Bias); + egr::AutogradMeta* p_autograd_Linear2Weight = + egr::EagerUtils::nullable_autograd_meta(Linear2Weight); + egr::AutogradMeta* p_autograd_Linear2Bias = + egr::EagerUtils::nullable_autograd_meta(Linear2Bias); + egr::AutogradMeta* p_autograd_Ln1Scale = + egr::EagerUtils::nullable_autograd_meta(Ln1Scale); + egr::AutogradMeta* p_autograd_Ln1Bias = + 
egr::EagerUtils::nullable_autograd_meta(Ln1Bias); + egr::AutogradMeta* p_autograd_Ln2Scale = + egr::EagerUtils::nullable_autograd_meta(Ln2Scale); + egr::AutogradMeta* p_autograd_Ln2Bias = + egr::EagerUtils::nullable_autograd_meta(Ln2Bias); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + + bool require_any_grad = + egr::EagerUtils::ComputeRequireGrad(trace_backward, + p_autograd_X, + p_autograd_Dropout1Seed, + p_autograd_Dropout2Seed, + p_autograd_Linear1Weight, + p_autograd_Linear1Bias, + p_autograd_Linear2Weight, + p_autograd_Linear2Bias, + p_autograd_Ln1Scale, + p_autograd_Ln1Bias, + p_autograd_Ln2Scale, + p_autograd_Ln2Bias); + + paddle::framework::AttributeMap attrs = attr_map; + paddle::framework::AttributeMap default_attrs; + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_feedforward", + ins, + outs, + attrs, + egr::Controller::Instance().GetExpectedPlace(), + &default_attrs, + true, + {}); + + paddle::experimental::Tensor Out; + egr::EagerUtils::GetOutput(outs["Out"][0], &Out); + paddle::experimental::Tensor Dropout1Mask; + egr::EagerUtils::GetOutput(outs["Dropout1Mask"][0], &Dropout1Mask); + paddle::experimental::Tensor Dropout2Mask; + egr::EagerUtils::GetOutput(outs["Dropout2Mask"][0], &Dropout2Mask); + paddle::experimental::Tensor Ln1Mean; + egr::EagerUtils::GetOutput(outs["Ln1Mean"][0], &Ln1Mean); + paddle::experimental::Tensor Ln1Variance; + egr::EagerUtils::GetOutput(outs["Ln1Variance"][0], &Ln1Variance); + paddle::experimental::Tensor Ln2Mean; + egr::EagerUtils::GetOutput(outs["Ln2Mean"][0], &Ln2Mean); + paddle::experimental::Tensor Ln2Variance; + egr::EagerUtils::GetOutput(outs["Ln2Variance"][0], &Ln2Variance); + paddle::experimental::Tensor Linear1Out; + egr::EagerUtils::GetOutput(outs["Linear1Out"][0], &Linear1Out); + paddle::experimental::Tensor Ln1Out; + egr::EagerUtils::GetOutput(outs["Ln1Out"][0], &Ln1Out); + paddle::experimental::Tensor Dropout1Out; + egr::EagerUtils::GetOutput(outs["Dropout1Out"][0], &Dropout1Out); + paddle::experimental::Tensor Dropout2Out; + egr::EagerUtils::GetOutput(outs["Dropout2Out"][0], &Dropout2Out); + + { + paddle::platform::RecordEvent node_creation_record_event( + "fused_feedforward node_creation", + paddle::platform::TracerEventType::Operator, + 1); + egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out); + egr::AutogradMeta* p_autograd_Dropout1Mask = + egr::EagerUtils::autograd_meta(&Dropout1Mask); + egr::AutogradMeta* p_autograd_Dropout2Mask = + egr::EagerUtils::autograd_meta(&Dropout2Mask); + egr::AutogradMeta* p_autograd_Ln1Mean = + egr::EagerUtils::autograd_meta(&Ln1Mean); + egr::AutogradMeta* p_autograd_Ln1Variance = + egr::EagerUtils::autograd_meta(&Ln1Variance); + egr::AutogradMeta* p_autograd_Ln2Mean = + egr::EagerUtils::autograd_meta(&Ln2Mean); + egr::AutogradMeta* p_autograd_Ln2Variance = + egr::EagerUtils::autograd_meta(&Ln2Variance); + egr::AutogradMeta* p_autograd_Linear1Out = + egr::EagerUtils::autograd_meta(&Linear1Out); + egr::AutogradMeta* p_autograd_Ln1Out = + egr::EagerUtils::autograd_meta(&Ln1Out); + egr::AutogradMeta* p_autograd_Dropout1Out = + egr::EagerUtils::autograd_meta(&Dropout1Out); + egr::AutogradMeta* p_autograd_Dropout2Out = + egr::EagerUtils::autograd_meta(&Dropout2Out); + if (require_any_grad) { + VLOG(6) << " Construct Grad for fused_feedforward "; + egr::EagerUtils::PassStopGradient(false, + p_autograd_Out, + p_autograd_Dropout1Mask, + p_autograd_Dropout2Mask, + p_autograd_Ln1Mean, + p_autograd_Ln1Variance, + p_autograd_Ln2Mean, + 
p_autograd_Ln2Variance, + p_autograd_Linear1Out, + p_autograd_Ln1Out, + p_autograd_Dropout1Out, + p_autograd_Dropout2Out); + // Create GradOpNode + auto grad_node = std::shared_ptr( + new fused_feedforwardGradNodeCompat(11, 11)); + + bool pre_layer_norm = false; + if (attrs.count("pre_layer_norm")) { + pre_layer_norm = BOOST_GET_CONST(bool, attrs.at("pre_layer_norm")); + } + + // Set Attributes + grad_node->SetAttrMap(std::move(attrs)); + grad_node->SetDefaultAttrMap(std::move(default_attrs)); + + grad_node->SetTensorWrapperX(X); + grad_node->SetTensorWrapperLinear1Weight(Linear1Weight); + grad_node->SetTensorWrapperLinear1Bias(Linear1Bias); + grad_node->SetTensorWrapperLinear2Weight(Linear2Weight); + grad_node->SetTensorWrapperDropout1Mask(Dropout1Mask); + grad_node->SetTensorWrapperDropout2Mask(Dropout2Mask); + grad_node->SetTensorWrapperLinear1Out(Linear1Out); + grad_node->SetTensorWrapperDropout1Out(Dropout1Out); + grad_node->SetTensorWrapperDropout2Out(Dropout2Out); + + grad_node->SetGradOutMeta(X, 0); + grad_node->SetGradOutMeta(Linear1Weight, 3); + grad_node->SetGradOutMeta(Linear1Bias, 4); + grad_node->SetGradOutMeta(Linear2Weight, 5); + + if (pre_layer_norm) { + grad_node->SetTensorWrapperLn1Scale(Ln1Scale); + grad_node->SetTensorWrapperLn1Bias(Ln1Bias); + grad_node->SetTensorWrapperLn1Out(Ln1Out); + grad_node->SetTensorWrapperLn1Mean(Ln1Mean); + grad_node->SetTensorWrapperLn1Variance(Ln1Variance); + grad_node->SetGradOutMeta(Ln1Scale, 7); + grad_node->SetGradOutMeta(Ln1Bias, 8); + } else { + grad_node->SetTensorWrapperLn2Scale(Ln2Scale); + grad_node->SetGradOutMeta(Ln2Scale, 9); + grad_node->SetTensorWrapperLn2Bias(Ln2Bias); + grad_node->SetGradOutMeta(Ln2Bias, 10); + grad_node->SetTensorWrapperLn2Mean(Ln2Mean); + grad_node->SetTensorWrapperLn2Variance(Ln2Variance); + } + + if (Linear2Bias.initialized()) { + grad_node->SetTensorWrapperLinear2Bias(Linear2Bias); + grad_node->SetGradOutMeta(Linear2Bias, 6); + } + + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 0); + egr::EagerUtils::SetHistory(p_autograd_Out, grad_node); + grad_node->SetGradInMeta(Out, 0); + egr::EagerUtils::CheckAndRetainGrad(Out); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout1Mask, 1); + grad_node->SetGradInMeta(Dropout1Mask, 1); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout2Mask, 2); + grad_node->SetGradInMeta(Dropout2Mask, 2); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Mean, 3); + grad_node->SetGradInMeta(Ln1Mean, 3); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Variance, 4); + grad_node->SetGradInMeta(Ln1Variance, 4); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Mean, 5); + grad_node->SetGradInMeta(Ln2Mean, 5); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Variance, 6); + grad_node->SetGradInMeta(Ln2Variance, 6); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Linear1Out, 7); + grad_node->SetGradInMeta(Linear1Out, 7); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Out, 8); + grad_node->SetGradInMeta(Ln1Out, 8); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout1Out, 9); + grad_node->SetGradInMeta(Dropout1Out, 9); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout2Out, 10); + grad_node->SetGradInMeta(Dropout2Out, 10); + } + } + + return std::make_tuple(Out, + Dropout1Mask, + Dropout2Mask, + Ln1Mean, + Ln1Variance, + Ln2Mean, + Ln2Variance, + Linear1Out, + Ln1Out, + Dropout1Out, + Dropout2Out); +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt 
index fb5e129223544..4eaa43a4b51c6 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt @@ -3,6 +3,11 @@ cc_library( SRCS fused_gate_attention_node.cc DEPS ${eager_deps} ${fluid_deps}) +cc_library( + fused_feedforward_node + SRCS fused_feedforward_node.cc + DEPS ${eager_deps} ${fluid_deps}) + set(fluid_manual_nodes - fused_gate_attention_node + fused_gate_attention_node fused_feedforward_node PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc new file mode 100644 index 0000000000000..5228cb3657825 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" + +paddle::small_vector, + egr::kSlotSmallVectorSize> +fused_feedforwardGradNodeCompat::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + VLOG(3) << "Running Eager Backward Node: fused_feedforwardGradNodeCompat"; + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + outputs(11); + + paddle::small_vector, + egr::kSlotSmallVectorSize> + hooked_grads0 = + fused_feedforwardGradNodeCompat::ApplyGradientHooks(grads); + + bool pre_layer_norm = false; + if (attr_map_.count("pre_layer_norm")) { + pre_layer_norm = BOOST_GET_CONST(bool, attr_map_.at("pre_layer_norm")); + } + + std::map>> ins0 = + {{"Dropout1Mask", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Dropout1Mask_))}, + {"Dropout1Out", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Dropout1Out_))}, + {"Dropout2Mask", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Dropout2Mask_))}, + {"Dropout2Out", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Dropout2Out_))}, + {"Linear1Out", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Out_))}, + {"Linear1Weight", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Weight_))}, + {"Linear2Weight", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Linear2Weight_))}, + {"Out@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[0])}, + {"X", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->X_))}}; + + std::map>> outs0; + + auto Linear1Bias = 
egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Bias_); + if (Linear1Bias.defined()) + ins0["Linear1Bias"] = egr::EagerUtils::TrySyncToVars(Linear1Bias); + + if ((!out_metas[3].empty()) && (!(out_metas[3][0].IsStopGradient()))) { + outs0.insert({"Linear1Weight@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[5].empty()) && (!(out_metas[5][0].IsStopGradient()))) { + outs0.insert({"Linear2Weight@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) { + outs0.insert({"X@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if (Linear1Bias.defined() && (!out_metas[4].empty()) && + (!out_metas[4][0].IsStopGradient())) + outs0["Linear1Bias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + + if (pre_layer_norm) { + auto Ln1Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Scale_); + if (Ln1Scale.defined()) + ins0["Ln1Scale"] = egr::EagerUtils::TrySyncToVars(Ln1Scale); + auto Ln1Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Bias_); + if (Ln1Bias.defined()) + ins0["Ln1Bias"] = egr::EagerUtils::TrySyncToVars(Ln1Bias); + auto Ln1Out = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Out_); + if (Ln1Out.defined()) + ins0["Ln1Out"] = egr::EagerUtils::TrySyncToVars(Ln1Out); + auto Ln1Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Mean_); + if (Ln1Mean.defined()) + ins0["Ln1Mean"] = egr::EagerUtils::TrySyncToVars(Ln1Mean); + auto Ln1Variance = + egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Variance_); + if (Ln1Variance.defined()) + ins0["Ln1Variance"] = egr::EagerUtils::TrySyncToVars(Ln1Variance); + if (Ln1Scale.defined() && (!out_metas[7].empty()) && + (!out_metas[7][0].IsStopGradient())) + outs0["Ln1Scale@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (Ln1Bias.defined() && (!out_metas[8].empty()) && + (!out_metas[8][0].IsStopGradient())) + outs0["Ln1Bias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + + } else { + auto Ln2Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Scale_); + if (Ln2Scale.defined()) + ins0["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale); + auto Ln2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Bias_); + if (Ln2Bias.defined()) + ins0["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias); + auto Ln2Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Mean_); + if (Ln2Mean.defined()) + ins0["Ln2Mean"] = egr::EagerUtils::TrySyncToVars(Ln2Mean); + auto Ln2Variance = + egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Variance_); + if (Ln2Variance.defined()) + ins0["Ln2Variance"] = egr::EagerUtils::TrySyncToVars(Ln2Variance); + if (Ln2Scale.defined() && (!out_metas[9].empty()) && + (!out_metas[9][0].IsStopGradient())) + outs0["Ln2Scale@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (Ln2Bias.defined() && (!out_metas[10].empty()) && + (!out_metas[10][0].IsStopGradient())) + outs0["Ln2Bias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto Linear2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Linear2Bias_); + if (Linear2Bias.defined()) { + ins0["Linear2Bias"] = egr::EagerUtils::TrySyncToVars(Linear2Bias); + if ((!out_metas[6].empty()) && (!out_metas[6][0].IsStopGradient())) + outs0["Linear2Bias@GRAD"] = {std::make_shared( + 
egr::Controller::Instance().GenerateUniqueName())}; + } + + auto& attrs_map0 = this->attr_map_; + // Pass the entire attribute map to TraceOp + // The underlying kernel will pickup whatever attribute they need at runtime + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_feedforward_grad", + ins0, + outs0, + attrs_map0, + egr::Controller::Instance().GetExpectedPlace(), + &this->default_attr_map_, + false, + {}); + + if (outs0.find("Linear1Weight@GRAD") != outs0.end()) { + outputs[3] = egr::EagerUtils::GetOutputs(outs0["Linear1Weight@GRAD"]); + } + if (outs0.find("Linear2Weight@GRAD") != outs0.end()) { + outputs[5] = egr::EagerUtils::GetOutputs(outs0["Linear2Weight@GRAD"]); + } + if (outs0.find("X@GRAD") != outs0.end()) { + outputs[0] = egr::EagerUtils::GetOutputs(outs0["X@GRAD"]); + } + if (outs0.find("Linear1Bias@GRAD") != outs0.end()) { + outputs[4] = egr::EagerUtils::GetOutputs(outs0["Linear1Bias@GRAD"]); + } + + if (pre_layer_norm) { + if (outs0.find("Ln1Scale@GRAD") != outs0.end()) { + outputs[7] = egr::EagerUtils::GetOutputs(outs0["Ln1Scale@GRAD"]); + } + if (outs0.find("Ln1Bias@GRAD") != outs0.end()) { + outputs[8] = egr::EagerUtils::GetOutputs(outs0["Ln1Bias@GRAD"]); + } + + } else { + if (outs0.find("Ln2Bias@GRAD") != outs0.end()) { + outputs[10] = egr::EagerUtils::GetOutputs(outs0["Ln2Bias@GRAD"]); + } + if (outs0.find("Ln2Scale@GRAD") != outs0.end()) { + outputs[9] = egr::EagerUtils::GetOutputs(outs0["Ln2Scale@GRAD"]); + } + } + + if (Linear2Bias.defined()) { + if (outs0.find("Linear2Bias@GRAD") != outs0.end()) { + outputs[6] = egr::EagerUtils::GetOutputs(outs0["Linear2Bias@GRAD"]); + } + } + + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs); + return outputs; +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h index 0f0fac4b725e0..52d3b44d7ba2a 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h @@ -174,3 +174,158 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase { paddle::framework::AttributeMap attr_map_; paddle::framework::AttributeMap default_attr_map_; }; + +class fused_feedforwardGradNodeCompat : public egr::GradNodeBase { + public: + fused_feedforwardGradNodeCompat() : egr::GradNodeBase() { + VLOG(7) << " Construct fused_feedforwardGradNodeCompat "; + } + fused_feedforwardGradNodeCompat(size_t bwd_in_slot_num, + size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { + VLOG(7) << " Construct fused_feedforwardGradNodeCompat "; + } + ~fused_feedforwardGradNodeCompat() override { + VLOG(6) << " Destruct fused_feedforwardGradNodeCompat "; + } + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; + + void ClearTensorWrappers() override { + Dropout1Mask_.clear(); + Dropout1Out_.clear(); + Dropout2Mask_.clear(); + Dropout2Out_.clear(); + Linear1Bias_.clear(); + Linear1Out_.clear(); + Linear1Weight_.clear(); + Linear2Bias_.clear(); + Linear2Weight_.clear(); + Ln2Bias_.clear(); + Ln2Mean_.clear(); + Ln2Scale_.clear(); + Ln2Variance_.clear(); + X_.clear(); + + SetIsTensorWrappersCleared(true); + } + std::string name() override { return "fused_feedforwardGradNodeCompat"; } + + std::shared_ptr Copy() const override { + { + auto copied_node = std::shared_ptr( + new 
fused_feedforwardGradNodeCompat(*this)); + return copied_node; + } + } + + // SetX, SetY, ... + void SetTensorWrapperDropout1Mask( + const paddle::experimental::Tensor& Dropout1Mask) { + Dropout1Mask_ = egr::TensorWrapper(Dropout1Mask, false); + } + void SetTensorWrapperDropout1Out( + const paddle::experimental::Tensor& Dropout1Out) { + Dropout1Out_ = egr::TensorWrapper(Dropout1Out, false); + } + void SetTensorWrapperDropout2Mask( + const paddle::experimental::Tensor& Dropout2Mask) { + Dropout2Mask_ = egr::TensorWrapper(Dropout2Mask, false); + } + void SetTensorWrapperDropout2Out( + const paddle::experimental::Tensor& Dropout2Out) { + Dropout2Out_ = egr::TensorWrapper(Dropout2Out, false); + } + void SetTensorWrapperLinear1Bias( + const paddle::experimental::Tensor& Linear1Bias) { + Linear1Bias_ = egr::TensorWrapper(Linear1Bias, false); + } + void SetTensorWrapperLinear1Out( + const paddle::experimental::Tensor& Linear1Out) { + Linear1Out_ = egr::TensorWrapper(Linear1Out, false); + } + void SetTensorWrapperLinear1Weight( + const paddle::experimental::Tensor& Linear1Weight) { + Linear1Weight_ = egr::TensorWrapper(Linear1Weight, false); + } + void SetTensorWrapperLinear2Bias( + const paddle::experimental::Tensor& Linear2Bias) { + Linear2Bias_ = egr::TensorWrapper(Linear2Bias, false); + } + void SetTensorWrapperLinear2Weight( + const paddle::experimental::Tensor& Linear2Weight) { + Linear2Weight_ = egr::TensorWrapper(Linear2Weight, false); + } + void SetTensorWrapperLn2Bias(const paddle::experimental::Tensor& Ln2Bias) { + Ln2Bias_ = egr::TensorWrapper(Ln2Bias, false); + } + void SetTensorWrapperLn2Mean(const paddle::experimental::Tensor& Ln2Mean) { + Ln2Mean_ = egr::TensorWrapper(Ln2Mean, false); + } + void SetTensorWrapperLn2Scale(const paddle::experimental::Tensor& Ln2Scale) { + Ln2Scale_ = egr::TensorWrapper(Ln2Scale, false); + } + void SetTensorWrapperLn2Variance( + const paddle::experimental::Tensor& Ln2Variance) { + Ln2Variance_ = egr::TensorWrapper(Ln2Variance, false); + } + void SetTensorWrapperX(const paddle::experimental::Tensor& X) { + X_ = egr::TensorWrapper(X, false); + } + void SetTensorWrapperLn1Scale(const paddle::experimental::Tensor& Ln1Scale) { + Ln1Scale_ = egr::TensorWrapper(Ln1Scale, false); + } + void SetTensorWrapperLn1Bias(const paddle::experimental::Tensor& Ln1Bias) { + Ln1Bias_ = egr::TensorWrapper(Ln1Bias, false); + } + void SetTensorWrapperLn1Out(const paddle::experimental::Tensor& Ln1Out) { + Ln1Out_ = egr::TensorWrapper(Ln1Out, false); + } + void SetTensorWrapperLn1Mean(const paddle::experimental::Tensor& Ln1Mean) { + Ln1Mean_ = egr::TensorWrapper(Ln1Mean, false); + } + void SetTensorWrapperLn1Variance( + const paddle::experimental::Tensor& Ln1Variance) { + Ln1Variance_ = egr::TensorWrapper(Ln1Variance, false); + } + // SetAttrMap + void SetAttrMap(paddle::framework::AttributeMap&& attr_map) { + attr_map_ = std::move(attr_map); + } + void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) { + default_attr_map_ = std::move(default_attr_map); + } + + private: + // TensorWrappers + egr::TensorWrapper Dropout1Mask_; + egr::TensorWrapper Dropout1Out_; + egr::TensorWrapper Dropout2Mask_; + egr::TensorWrapper Dropout2Out_; + egr::TensorWrapper Linear1Bias_; + egr::TensorWrapper Linear1Out_; + egr::TensorWrapper Linear1Weight_; + egr::TensorWrapper Linear2Bias_; + egr::TensorWrapper Linear2Weight_; + egr::TensorWrapper Ln2Bias_; + egr::TensorWrapper Ln2Mean_; + egr::TensorWrapper Ln2Scale_; + egr::TensorWrapper Ln2Variance_; + egr::TensorWrapper X_; + 
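+  // The wrappers below are captured only when pre_layer_norm is true
+  // (see the SetTensorWrapperLn1* calls in the forward function).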
+ egr::TensorWrapper Ln1Scale_; + egr::TensorWrapper Ln1Bias_; + egr::TensorWrapper Ln1Out_; + egr::TensorWrapper Ln1Mean_; + egr::TensorWrapper Ln1Variance_; + + // Attribute Map + paddle::framework::AttributeMap attr_map_; + paddle::framework::AttributeMap default_attr_map_; +}; diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index bbd6ea6494638..6eb35eb13f3f7 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -52,7 +52,7 @@ static std::unordered_set ops_to_fill_zero_for_empty_grads = { /* --- Black Ops list that's NO NEED to apply code generation --- */ static std::unordered_set black_ops_list = { - "run_program", "fused_gate_attention"}; + "run_program", "fused_gate_attention", "fused_feedforward"}; static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py index 43d39224287e6..8d2873276033a 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -23,9 +23,7 @@ from paddle.nn.layer.common import Linear, Dropout import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph - -_enable_legacy_dygraph() +from paddle.fluid.framework import default_main_program class TestFusedFFNOp(OpTest): From d7f4599d11cf6fec67ecab38129a36d06cac10b5 Mon Sep 17 00:00:00 2001 From: LiYuRio <63526175+LiYuRio@users.noreply.github.com> Date: Wed, 6 Jul 2022 15:09:13 +0800 Subject: [PATCH 074/250] Fix nan in fused multi transformer (#44093) --- paddle/fluid/distributed/store/tcp_store.cc | 5 +- .../fused/fused_multi_transformer_op.cu | 66 +++++++++++++++++-- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index a67ca29a543ab..e4228e4428d89 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -125,7 +125,10 @@ void MasterDaemon::CloseControlFd() { void MasterDaemon::StopByControlFd() { VLOG(4) << ("begin to run StopByControlFd"); if (_control_fd[1] != -1) { - ::write(_control_fd[1], "\0", 1); + PADDLE_ENFORCE_NE(::write(_control_fd[1], "\0", 1), + -1, + platform::errors::Fatal( + "failed to write control pipe errno:%d", errno)); // close the write end of the pipe ::close(_control_fd[1]); _control_fd[1] = -1; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index f806359093cb2..fafbcf724d726 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -294,6 +294,52 @@ inline __device__ uint4 mul(uint4 a, uint4 b) { return c; } +template <> +inline __device__ uint32_t mul(uint32_t a, float b) { + float2 tmp = half2_to_float2(a); + float2 tmp_res; + tmp_res.x = tmp.x * b; + tmp_res.y = tmp.y * b; + uint32_t res = float2_to_half2(tmp_res); + return res; +} + +template <> +inline __device__ uint2 mul(uint2 a, float b) { + uint2 res; + res.x = mul(a.x, b); + res.y = mul(a.y, b); + return res; +} + +template <> +inline __device__ uint4 mul(uint4 a, float b) { + uint4 res; + res.x = mul(a.x, b); + 
res.y = mul(a.y, b); + res.z = mul(a.z, b); + res.w = mul(a.w, b); + return res; +} + +template <> +inline __device__ float2 mul(float2 a, float b) { + float2 res; + res.x = a.x * b; + res.y = a.y * b; + return res; +} + +template <> +inline __device__ float4 mul(float4 a, float b) { + float4 res; + res.x = a.x * b; + res.y = a.y * b; + res.z = a.z * b; + res.w = a.w * b; + return res; +} + inline __device__ float sum(float v) { return v; } inline __device__ float sum(float2 v) { return v.x + v.y; } inline __device__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } @@ -445,11 +491,15 @@ inline __device__ Float8_ cast_to_float(uint4 u) { } template -inline __device__ float qk_dot_(const K_vec (&q)[N], const K_vec (&k)[N]) { - K_vec qk_vec = mul(q[0], k[0]); +inline __device__ float qk_dot_(const K_vec (&q)[N], + const K_vec (&k)[N], + float inv_sqrt_dh) { + K_vec inv_q = mul(q[0], inv_sqrt_dh); + K_vec qk_vec = mul(inv_q, k[0]); #pragma unroll for (int ii = 1; ii < N; ++ii) { - qk_vec = fma(q[ii], k[ii], qk_vec); + inv_q = mul(q[ii], inv_sqrt_dh); + qk_vec = fma(inv_q, k[ii], qk_vec); } float qk = sum(qk_vec); @@ -463,8 +513,10 @@ inline __device__ float qk_dot_(const K_vec (&q)[N], const K_vec (&k)[N]) { template struct Qk_dot { template - static inline __device__ float dot(const K_vec (&q)[N], const K_vec (&k)[N]) { - return qk_dot_(q, k); + static inline __device__ float dot(const K_vec (&q)[N], + const K_vec (&k)[N], + float inv_sqrt_dh) { + return qk_dot_(q, k, inv_sqrt_dh); } }; @@ -706,7 +758,9 @@ __global__ void masked_multihead_attention_kernel( } } - float qk = Qk_dot::dot(q, k) * params.inv_sqrt_dh; + // NOTE(liyurui): We should multiple q with inv_sqrt_dh first, for dot(q, k) + // may overflow with FP16 in large model. + float qk = Qk_dot::dot(q, k, params.inv_sqrt_dh); // bool is_mask = false; if (ti < params.timestep && tid % THREADS_PER_KEY == 0) { From 81fd2fff617e258e777dd520a6c826039b0eb222 Mon Sep 17 00:00:00 2001 From: handiz <35895648+ZhangHandi@users.noreply.github.com> Date: Wed, 6 Jul 2022 16:38:48 +0800 Subject: [PATCH 075/250] fix quant scale name (#44116) --- .../slim/quantization/post_training_quantization.py | 4 ++-- .../contrib/slim/quantization/quantization_pass.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index f1da3990a36be..a46a0d12fddea 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -963,10 +963,10 @@ def _update_program(self): else: scale_dict = self._quantized_threshold for key, val in scale_dict.items(): - utils.set_variable_data(self._scope, self._place, key + ".scale", + utils.set_variable_data(self._scope, self._place, key + "@scale", np.array([val], dtype=np.float32)) utils.set_variable_data(self._scope, self._place, - key + ".quant_dequant.scale", + key + ".quant_dequant@scale", np.array([val], dtype=np.float32)) if not self._onnx_format: diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 3a316e9192e39..d3ce543320ef4 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -906,7 +906,7 @@ def _quantized_scale_name(self, var_name): 
""" Return the scale name of quantized variable for the input `var_name`. """ - return "%s.scale" % (var_name) + return "%s@scale" % (var_name) def _is_skip_quant(self, graph, op_node): """ @@ -1246,8 +1246,8 @@ def _original_var_name(self, var_name): return var_name[:-len('.quantized')] if var_name.endswith('.dequantized'): return var_name[:-len('.dequantized')] - if var_name.endswith('.scale'): - return var_name[:-len('.scale')] + if var_name.endswith('@scale'): + return var_name[:-len('@scale')] else: return var_name @@ -1705,7 +1705,7 @@ def _inser_quant_dequant_moving_average_abs_max_op(self, graph, var_node, shape=var_node.shape(), var_dtype=var_node.dtype()) scale_in_node = graph.create_persistable_node( - name="{}.quant_dequant.scale".format(var_node.name()), + name="{}.quant_dequant@scale".format(var_node.name()), var_type=core.VarDesc.VarType.LOD_TENSOR, shape=[1], var_dtype=var_node.dtype()) @@ -1922,7 +1922,7 @@ def _quantized_scale_name(self, var_name): """ Return the scale name of quantized variable for the input `var_name`. """ - return "%s.scale" % (var_name) + return "%s@scale" % (var_name) def _zero_point_name(self, var_name): """ From 6eed9f4994ceab781502f36cd36dbd15aac0db34 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 6 Jul 2022 03:46:25 -0500 Subject: [PATCH 076/250] Refine StandaloneExecutor (#44076) * not run startup program in constructor of StandaloneExecutor * clear interface of standalone executor * clean debug code --- .../new_executor/executor_statistics.cc | 3 +- .../new_executor/standalone_executor.cc | 35 +-- .../new_executor/standalone_executor.h | 15 +- paddle/fluid/pybind/pybind.cc | 49 +--- python/paddle/fluid/executor.py | 17 +- python/paddle/fluid/framework.py | 13 + .../test_standalone_controlflow.py | 14 +- .../interpreter/test_standalone_executor.py | 258 +++++------------- .../unittests/mkldnn/test_conv2d_mkldnn_op.py | 10 - 9 files changed, 107 insertions(+), 307 deletions(-) diff --git a/paddle/fluid/framework/new_executor/executor_statistics.cc b/paddle/fluid/framework/new_executor/executor_statistics.cc index c9bb7d4555fc0..a381943587d03 100644 --- a/paddle/fluid/framework/new_executor/executor_statistics.cc +++ b/paddle/fluid/framework/new_executor/executor_statistics.cc @@ -583,7 +583,8 @@ int StatisticsEngine::StatNormalizationTime( if (total - normalization_sum != 0) { LOG(WARNING) << "total: " << total << "is greater than normalization_sum:" << normalization_sum; - return -1; + // TODO(dev): figure out why total != normalization_sum and fix it + // return -1; } return 0; } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 31b1627dc650a..2e6e9aa8427b0 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -19,34 +19,8 @@ namespace paddle { namespace framework { StandaloneExecutor::StandaloneExecutor(const platform::Place& place, - const ProgramDesc& startup_prog, - const ProgramDesc& main_prog, - Scope* scope) - : place_(place), - startup_prog_(startup_prog), - main_prog_(main_prog), - scope_(scope) { - // NOTE(zhiqiu): for startup_program, run once ? 
- if (startup_prog.Block(0).AllOps().size() > 0) { - auto core = GetInterpreterCore(scope, startup_prog, {}, {}, false); - VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; - core->Run({}); - } -} - -paddle::framework::FetchList StandaloneExecutor::Run( - Scope* scope, - const std::vector& feed_names, - const std::vector& feed_tensors, - const std::vector& fetch_names) { - platform::RecordEvent record_event( - "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); - - auto core = - GetInterpreterCore(scope, main_prog_, feed_names, fetch_names, true); - - return core->Run(feed_names, feed_tensors); -} + const ProgramDesc& prog) + : place_(place), prog_(prog) {} paddle::framework::FetchList StandaloneExecutor::Run( Scope* scope, @@ -55,8 +29,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( platform::RecordEvent record_event( "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1); - auto core = - GetInterpreterCore(scope, main_prog_, feed_names, fetch_names, false); + auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names, false); VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core; return core->Run(feed_names); } @@ -65,7 +38,7 @@ framework::interpreter::CostInfo StandaloneExecutor::DryRun( Scope* scope, const std::vector& feed_names, const std::vector& feed_tensors) { - auto core = GetInterpreterCore(scope, main_prog_, feed_names, {}, true); + auto core = GetInterpreterCore(scope, prog_, feed_names, {}, true); return core->DryRun(feed_names, feed_tensors); } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 5b9c48009ea83..e6d84d6f9a183 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -31,19 +31,10 @@ class InterpreterCore; class StandaloneExecutor { public: - StandaloneExecutor(const platform::Place& place, - const ProgramDesc& startup_prog, - const ProgramDesc& main_prog, - Scope* scope); + StandaloneExecutor(const platform::Place& place, const ProgramDesc& prog); ~StandaloneExecutor() {} - paddle::framework::FetchList Run( - Scope* scope, - const std::vector& feed_names, - const std::vector& feed_tensors, - const std::vector& fetch_names); - // NOTE(zhiqiu): feed_names are only used for caching interpretercore. // fetch_names are used for caching interpretercore and inserting fetch ops, // the latter can be moved to python side. @@ -65,9 +56,7 @@ class StandaloneExecutor { bool add_fetch_op); platform::Place place_; - const ProgramDesc& startup_prog_; - const ProgramDesc& main_prog_; - Scope* scope_; // not owned + const ProgramDesc& prog_; std::unordered_map> interpretercores_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3723e58e52902..abbcacec3858e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3057,54 +3057,7 @@ All parameter, weight, gradient are variables in Paddle. 
}); py::class_(m, "StandaloneExecutor") - .def(py::init()) - .def("run", - [](StandaloneExecutor &self, - Scope *scope, - const std::unordered_map &input_dict, - std::vector fetch_names) { - std::vector feed_tensors; - std::vector feed_names; - - for (auto &item : input_dict) { - framework::LoDTensor t; - SetTensorFromPyArray( - &t, item.second, platform::CPUPlace(), false); - feed_names.push_back(item.first); - feed_tensors.push_back(t); - } - - paddle::framework::FetchList ret; - { - pybind11::gil_scoped_release release; - ret = self.Run(scope, feed_names, feed_tensors, fetch_names); - } - return py::cast(std::move(ret)); - }) - .def("run", - [](StandaloneExecutor &self, - Scope *scope, - const std::unordered_map - &input_dict, - std::vector fetch_names) { - std::vector feed_tensors; - std::vector feed_names; - - for (auto &item : input_dict) { - feed_names.push_back(item.first); - feed_tensors.push_back(item.second); - } - - paddle::framework::FetchList ret; - { - pybind11::gil_scoped_release release; - ret = self.Run(scope, feed_names, feed_tensors, fetch_names); - } - return py::cast(std::move(ret)); - }) + .def(py::init()) .def("run", [](StandaloneExecutor &self, Scope *scope, diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 78c3f413966e9..3303b6c9472ff 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -25,6 +25,7 @@ from .data_feeder import convert_dtype from .framework import Program, default_main_program, Variable, Operator from .framework import convert_np_dtype_to_dtype_ + from . import core from . import unique_name from . import compiler @@ -397,15 +398,12 @@ def _is_enable_standalone_executor(): Whether to use experimental executor `StandaloneExecutor`. """ flag = False - from ..distributed.fleet import fleet - if fleet._role_maker is not None: - warnings.warn("do not use standalone executor in fleet by default") - env_val = os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', None) - else: - env_val = os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', '1') + # use standalone_executor by default if not distributed + if fleet._role_maker is None and framework._enable_standalone_executor_ is None: + framework._enable_standalone_executor_ = 1 - if env_val in [1, '1', True, 'True', 'true']: + if framework._enable_standalone_executor_ in [1, '1', True, 'True', 'true']: flag = True return flag @@ -569,10 +567,7 @@ def run(self, scope, feed_names, fetch_list, return_numpy=True): return tensors def _create_new_executor(self): - # NOTE: It's a trick to set empty start_up program. 
- startup_program = Program() - new_exe = core.StandaloneExecutor(self._place, startup_program.desc, - self._main_program.desc, self._scope) + new_exe = core.StandaloneExecutor(self._place, self._main_program.desc) return new_exe diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index df4691d49e891..d6e4af586699b 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -84,6 +84,8 @@ _already_patch_varbase = False _current_cuda_graph_mode = None _global_flags_ = core.globals() +_enable_standalone_executor_ = (os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', + None)) # Some explanation of our execution system 2022.03 # For now we have 3 kinds of execution system, since we refactored dygraph mode to @@ -259,6 +261,17 @@ def _test_eager_guard(place=None): ipu_stage_attr_name = 'ipu_stage' +@signature_safe_contextmanager +def _enable_standalone_executor(enable=True): + global _enable_standalone_executor_ + original_ = _enable_standalone_executor_ + _enable_standalone_executor_ = enable + try: + yield + finally: + _enable_standalone_executor_ = original_ + + @signature_safe_contextmanager def ipu_shard_guard(index=-1, stage=-1): """ diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py index 5ce035097d01a..aa0290cf4b5fa 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_controlflow.py @@ -16,7 +16,7 @@ import sys import unittest import paddle -from paddle.fluid import core +from paddle.fluid import core, framework from paddle.fluid.core import StandaloneExecutor import paddle.fluid as fluid from paddle.fluid.framework import Program, program_guard @@ -81,17 +81,13 @@ def _run(self, feed): return ret def run_raw_executor(self, feed): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' - out = self._run(feed) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] - print("GT:", out) + with framework._enable_standalone_executor(False): + out = self._run(feed) return out def run_new_executor(self, feed): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - out = self._run(feed) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] - print("New:", out) + with framework._enable_standalone_executor(True): + out = self._run(feed) return out def test_with_feed(self): diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py index f1b1bc118eb30..ad13061d17802 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py +++ b/python/paddle/fluid/tests/unittests/interpreter/test_standalone_executor.py @@ -20,7 +20,7 @@ import unittest import paddle import json -from paddle.fluid import core +from paddle.fluid import core, framework from paddle.fluid.core import StandaloneExecutor from paddle.profiler import profiler @@ -29,7 +29,7 @@ paddle.enable_static() -class LinearTestCase(unittest.TestCase): +class TestDryRun(unittest.TestCase): def setUp(self): place = paddle.CUDAPlace( @@ -48,29 +48,13 @@ def build_program(self): return startup_program, main_program, c - def test_interp_base(self): - startup_program, main_program, c = self.build_program() - scope = core.Scope() - standaloneexecutor = StandaloneExecutor(self.place, - startup_program.desc, - main_program.desc, scope) - out = 
standaloneexecutor.run( - scope, {"a": np.ones([2, 2], dtype="float32") * 2}, [c.name]) - for i in range(10): - out = standaloneexecutor.run( - scope, {"a": np.ones([2, 2], dtype="float32") * i}, [c.name]) - - for i in range(10): - out = standaloneexecutor.run( - scope, {"a": np.ones([2, 2], dtype="float32") * i}, - ['a', c.name]) - def test_dry_run(self): scope = core.Scope() startup_program, main_program, c = self.build_program() - standaloneexecutor = StandaloneExecutor(self.place, - startup_program.desc, - main_program.desc, scope) + exe = paddle.static.Executor(self.place) + exe.run(startup_program, scope=scope) + + standaloneexecutor = StandaloneExecutor(self.place, main_program.desc) # test for cost_info cost_info = standaloneexecutor.dry_run( scope, {"a": np.ones([2, 2], dtype="float32")}) @@ -124,100 +108,49 @@ def setUp(self): self.iter_n = 3 self.place = paddle.CUDAPlace( 0) if core.is_compiled_with_cuda() else paddle.CPUPlace() - - def test_standalone_executor_statistics(self): - if os.getenv("FLAGS_static_executor_perfstat_filepath") is None: - return - - paddle.seed(2020) - main_program, startup_program, fetch_list = build_program() - fetch_list = [x.name for x in fetch_list] - - p = core.Place() - p.set_place(self.place) - scope = core.Scope() - executor = StandaloneExecutor(p, startup_program.desc, - main_program.desc, scope) - - helper_profiler = profiler.Profiler( - targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) - helper_profiler.start() - for i in range(self.iter_n): - executor.run(scope, {}, fetch_list) - helper_profiler.step() - helper_profiler.stop() - - perfstat_filepath = os.environ[ - 'FLAGS_static_executor_perfstat_filepath'] - self.assertTrue(os.path.exists(perfstat_filepath)) - with open(perfstat_filepath, 'r') as load_f: - stat_res = json.load(load_f) - self.assertTrue(len(stat_res) > 0) - - os.remove(perfstat_filepath) - shutil.rmtree('./profiler_log') + self.perf_path = './perfstat' def test_parallel_executor_statistics(self): - if os.getenv("FLAGS_static_executor_perfstat_filepath") is None: - return + self.run_with_statistics(executor='ParallelExecutor') - paddle.seed(2020) - main_program, startup_program, fetch_list = build_program() - fetch_list = [x.name for x in fetch_list] - - main_program = paddle.fluid.compiler.CompiledProgram(main_program) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' - executor = paddle.static.Executor(self.place) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - executor.run(startup_program) - - helper_profiler = profiler.Profiler( - targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) - helper_profiler.start() - for i in range(self.iter_n): - executor.run(main_program, fetch_list=fetch_list) - helper_profiler.step() - helper_profiler.stop() - - perfstat_filepath = os.environ[ - 'FLAGS_static_executor_perfstat_filepath'] - self.assertTrue(os.path.exists(perfstat_filepath)) - with open(perfstat_filepath, 'r') as load_f: - stat_res = json.load(load_f) - self.assertTrue(len(stat_res) > 0) + def test_executor_statistics(self): + self.run_with_statistics(executor='Executor') - os.remove(perfstat_filepath) - shutil.rmtree('./profiler_log') + def test_standalone_executor_statistics(self): + self.run_with_statistics(executor='StandaloneExecutor') - def test_executor_statistics(self): + def run_with_statistics(self, executor=None): if os.getenv("FLAGS_static_executor_perfstat_filepath") is None: return - paddle.seed(2020) + # note: startup program is empty main_program, startup_program, fetch_list = build_program() 
- fetch_list = [x.name for x in fetch_list] - - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '0' - executor = paddle.static.Executor(self.place) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - executor.run(startup_program) - - helper_profiler = profiler.Profiler( - targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) - helper_profiler.start() - for i in range(self.iter_n): - executor.run(main_program, fetch_list=fetch_list) - helper_profiler.step() - helper_profiler.stop() - - perfstat_filepath = os.environ[ - 'FLAGS_static_executor_perfstat_filepath'] - self.assertTrue(os.path.exists(perfstat_filepath)) - with open(perfstat_filepath, 'r') as load_f: + + enable = True + if executor == 'ParallelExecutor': + main_program = paddle.fluid.compiler.CompiledProgram(main_program) + enable = False + elif executor == 'Executor': + enable = False + + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): + with framework._enable_standalone_executor(enable): + exe = paddle.static.Executor(self.place) + helper_profiler = profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], scheduler=(1, 2)) + helper_profiler.start() + for i in range(self.iter_n): + exe.run(main_program, fetch_list=fetch_list) + helper_profiler.step() + helper_profiler.stop() + + self.assertTrue(os.path.exists(self.perf_path)) + with open(self.perf_path, 'r') as load_f: stat_res = json.load(load_f) self.assertTrue(len(stat_res) > 0) - os.remove(perfstat_filepath) + os.remove(self.perf_path) shutil.rmtree('./profiler_log') @@ -229,59 +162,24 @@ def setUp(self): 0) if core.is_compiled_with_cuda() else paddle.CPUPlace() def test_result(self): - ground_truths = self.run_raw_executor() - res = self.run_new_executor() + ground_truths = self.run_test(False) + res = self.run_test(True) for gt, out in zip(ground_truths, res): self.assertEqual(gt[0], out[0]) - def run_raw_executor(self): + def run_test(self, use_new_executor=True): paddle.seed(2020) main_program, startup_program, fetch_list = build_program() - exe = paddle.static.Executor(self.place) - exe.run(startup_program) - - outs = [] - for i in range(self.iter_n): - outs.append(exe.run(main_program, fetch_list=fetch_list)) - - return outs - - def run_new_executor(self): - paddle.seed(2020) - main_program, startup_program, fetch_list = build_program() - fetch_list = [x.name for x in fetch_list] - - p = core.Place() - p.set_place(self.place) - scope = core.Scope() - inter_core = StandaloneExecutor(p, startup_program.desc, - main_program.desc, scope) - - outs = [] - for i in range(self.iter_n): - outs.append( - np.array( - inter_core.run(scope, {}, fetch_list)._move_to_list()[0])) - return outs - - -class SwitchExecutorInterfaceTestCase(MultiStreamModelTestCase): - - def run_new_executor(self): - paddle.seed(2020) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - main_program, startup_program, fetch_list = build_program() - exe = paddle.static.Executor(self.place) - exe.run(startup_program) - - outs = [] - for i in range(self.iter_n): - outs.append(exe.run(main_program, fetch_list=fetch_list)) - - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] - + with framework._enable_standalone_executor(use_new_executor): + scope = core.Scope() + exe = paddle.static.Executor(self.place) + outs = [] + for i in range(self.iter_n): + outs.append( + exe.run(main_program, scope=scope, fetch_list=fetch_list)) + print(outs) return outs @@ -337,23 +235,23 @@ def _run(self, return outs def run_raw_executor(self, feed, use_compiled=False): - # run construct program 1 - out1 = 
self._run(feed, - use_str=False, - is_double=False, - use_compiled=use_compiled) - # run construct program 2 with same executor - out2 = self._run(feed, - use_str=True, - is_double=True, - use_compiled=use_compiled) - - return [out1, out2] + with framework._enable_standalone_executor(False): + # run construct program 1 + out1 = self._run(feed, + use_str=False, + is_double=False, + use_compiled=use_compiled) + # run construct program 2 with same executor + out2 = self._run(feed, + use_str=True, + is_double=True, + use_compiled=use_compiled) + + return [out1, out2] def run_new_executor(self, feed, use_compiled=False): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - out = self.run_raw_executor(feed, use_compiled=use_compiled) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + with framework._enable_standalone_executor(): + out = self.run_raw_executor(feed, use_compiled=use_compiled) return out def test_with_feed(self): @@ -369,9 +267,8 @@ def test_with_error(self): feed = [{'a': np.ones([2, 2], dtype="float32")}] with self.assertRaises(TypeError): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - self._run(feed[0], add_wrong_fetch=True) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + with framework._enable_standalone_executor(): + self._run(feed[0], add_wrong_fetch=True) def test_compiled_program(self): data = np.ones([2, 2], dtype="float32") @@ -386,9 +283,7 @@ def test_compiled_program_convert_graph_to_program(self): data = np.ones([2, 2], dtype="float32") feed = {"a": data} - os.environ['FLAGS_CONVERT_GRAPH_TO_PROGRAM'] = '1' res = self.run_new_executor(feed, use_compiled=True) - del os.environ['FLAGS_CONVERT_GRAPH_TO_PROGRAM'] gt = self.run_raw_executor(feed, use_compiled=True) for x, y in zip(gt, res): self.assertTrue(np.array_equal(x, y)) @@ -401,9 +296,8 @@ def test_empty_program(self): for i in range(10): print(i, flush=1) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - out = exe.run(program, feed=None) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + with framework._enable_standalone_executor(): + out = exe.run(program, feed=None) class TestException(unittest.TestCase): @@ -437,14 +331,12 @@ def _run(self, feeds): for feed in feeds: out = exe.run(main_program, feed=feed, fetch_list=fetch_vars) - print(main_program) self.fetch_vars = fetch_vars return out def run_new_executor(self, feed): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - out = self._run(feed) - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + with framework._enable_standalone_executor(): + out = self._run(feed) return out def test_exception(self): @@ -492,14 +384,12 @@ def test_increment(self): with paddle.fluid.device_guard("cpu"): x = paddle.increment(x) exe = paddle.static.Executor(paddle.CUDAPlace(0)) - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - - for i in range(10): - a, = exe.run(paddle.static.default_main_program(), - fetch_list=[x]) - self.assertEqual(a[0], 1) + with framework._enable_standalone_executor(): - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] + for i in range(10): + a, = exe.run(paddle.static.default_main_program(), + fetch_list=[x]) + self.assertEqual(a[0], 1) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 0471c295ad45d..91487fb0ab64d 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -246,16 +246,6 @@ def 
init_group(self): self.groups = 3 -# TODO(chenweihang): To solve the coverage problem, add this unittest, -# remove this unittest after new executor set to default executor -class TestConv2dMKLDNNByNewExecutor(TestConv2DMKLDNNOp): - - def test_check_output_by_new_executor(self): - os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1' - self.test_check_output() - del os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] - - if __name__ == '__main__': from paddle import enable_static enable_static() From 064e549bdd54806a80ecb9e40306a853d2a5af13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Wed, 6 Jul 2022 17:15:14 +0800 Subject: [PATCH 077/250] force single thread when CINN is on (#44103) --- python/paddle/fluid/compiler.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 06f206c36d111..38393311de5f8 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -16,6 +16,7 @@ import os import six import sys +import warnings from .. import compat as cpt from . import framework from .framework import _get_paddle_place, _get_paddle_place_list @@ -373,6 +374,12 @@ def _compile_data_parallel(self, places, use_device, scope=None): else: self._exec_strategy.num_threads = len(places) * 2 + if "FLAGS_use_cinn" in core.globals() and core.globals( + )["FLAGS_use_cinn"] and self._exec_strategy.num_threads != 1: + warnings.warn("At present, when CINN is turned on, each process can " \ + "only contain one thread, so reset the number of threads to 1 here.") + self._exec_strategy.num_threads = 1 + if self._build_strategy.num_trainers > 1: assert self._is_data_parallel, \ "If you use multi-trainer to train the model, you should use "\ From 9c32099d050713d1416fb59750761316f5f0831a Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Wed, 6 Jul 2022 17:55:56 +0800 Subject: [PATCH 078/250] [Sparse] support optional kp_mask/attn_mask of sparse attention (#44120) --- .../phi/api/yaml/generator/sparse_api_gen.py | 16 ++-- paddle/phi/api/yaml/sparse_api.yaml | 2 + paddle/phi/api/yaml/sparse_bw_api.yaml | 2 + .../sparse/cpu/fused_attention_kernel.cc | 19 +++-- .../kernels/sparse/fused_attention_kernel.h | 19 +++-- .../sparse/gpu/fused_attention_kernel.cu | 85 ++++++++++--------- .../test_sparse_fused_attention_op.py | 75 +++++++++------- .../sparse/nn/functional/transformer.py | 12 +-- 8 files changed, 132 insertions(+), 98 deletions(-) diff --git a/paddle/phi/api/yaml/generator/sparse_api_gen.py b/paddle/phi/api/yaml/generator/sparse_api_gen.py index 17eb70e5c3e5f..69bf6950cd822 100644 --- a/paddle/phi/api/yaml/generator/sparse_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_api_gen.py @@ -111,9 +111,8 @@ def gen_sparse_kernel_context(self, kernel_output_names): for param in kernel_param: if param in input_names: if param in self.optional_vars: - raise ValueError( - f"{self.api} : Unsupport optional input({param}) for sparse api." - ) + kernel_context_code = kernel_context_code + f""" + kernel_context.EmplaceBackInput({param} ? 
{param}->impl().get() : nullptr);""" else: kernel_context_code = kernel_context_code + f""" kernel_context.EmplaceBackInput({param}.impl().get());""" @@ -170,9 +169,14 @@ def get_condition_code(self, kernel_name): condition_list = [] for i, in_type in enumerate(input_types): if in_type == "dense": - condition_list.append( - f"phi::DenseTensor::classof({self.inputs['names'][i]}.impl().get())" - ) + if self.inputs['names'][i] in self.optional_vars: + condition_list.append( + f"(!{self.inputs['names'][i]} || phi::DenseTensor::classof({self.inputs['names'][i]}->impl().get()))" + ) + else: + condition_list.append( + f"phi::DenseTensor::classof({self.inputs['names'][i]}.impl().get())" + ) else: condition_list.append( f"{self.inputs['names'][i]}.layout() == {sparse_type_map[in_type]}" diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index a6520a0d48472..68c41d50ae5ff 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -147,6 +147,8 @@ kernel : func : fused_attention_csr{dense, dense, dense, sparse_csr, dense, dense -> dense, sparse_csr} layout : sparse_mask + data_type: query + optional : key_padding_mask, attn_mask intermediate : softmax backward: fused_attention_grad diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 5296d1b870bee..0ca9c9daa9a5a 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -134,3 +134,5 @@ output : Tensor(query_grad), Tensor(key_grad), Tensor(value_grad) kernel : func : fused_attention_csr_grad{dense, dense, dense, sparse_csr, dense -> dense, dense, dense} + layout : softmax + data_type: query diff --git a/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc b/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc index 6c652c6a8c4d6..11c9e2d5c2007 100644 --- a/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/fused_attention_kernel.cc @@ -21,15 +21,16 @@ namespace phi { namespace sparse { template -void FusedAttentionCsrKernel(const Context& dev_ctx, - const DenseTensor& query, - const DenseTensor& key, - const DenseTensor& value, - const SparseCsrTensor& sparse_mask, - const DenseTensor& key_padding_mask, - const DenseTensor& attn_mask, - DenseTensor* out, - SparseCsrTensor* softmax) { +void FusedAttentionCsrKernel( + const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& sparse_mask, + const paddle::optional& key_padding_mask, + const paddle::optional& attn_mask, + DenseTensor* out, + SparseCsrTensor* softmax) { PD_THROW( "Not support CPU kernel of 'sparse.nn.functional.fused_attention' now"); } diff --git a/paddle/phi/kernels/sparse/fused_attention_kernel.h b/paddle/phi/kernels/sparse/fused_attention_kernel.h index feff9d72e644c..340fdce0196c3 100644 --- a/paddle/phi/kernels/sparse/fused_attention_kernel.h +++ b/paddle/phi/kernels/sparse/fused_attention_kernel.h @@ -21,15 +21,16 @@ namespace phi { namespace sparse { template -void FusedAttentionCsrKernel(const Context& dev_ctx, - const DenseTensor& query, - const DenseTensor& key, - const DenseTensor& value, - const SparseCsrTensor& sparse_mask, - const DenseTensor& key_padding_mask, - const DenseTensor& attn_mask, - DenseTensor* out, - SparseCsrTensor* softmax); +void FusedAttentionCsrKernel( + const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& 
sparse_mask, + const paddle::optional& key_padding_mask, + const paddle::optional& attn_mask, + DenseTensor* out, + SparseCsrTensor* softmax); } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu index 9a7e55d2d6210..46412d57f16c7 100644 --- a/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu @@ -127,15 +127,16 @@ __global__ void AttnSoftmaxGpuKernel(const int64_t* x_crows, } template -void FusedAttentionCsrKernel(const Context& dev_ctx, - const DenseTensor& query, - const DenseTensor& key, - const DenseTensor& value, - const SparseCsrTensor& sparse_mask, - const DenseTensor& key_padding_mask, - const DenseTensor& attn_mask, - DenseTensor* out, - SparseCsrTensor* softmax) { +void FusedAttentionCsrKernel( + const Context& dev_ctx, + const DenseTensor& query, + const DenseTensor& key, + const DenseTensor& value, + const SparseCsrTensor& sparse_mask, + const paddle::optional& key_padding_mask, + const paddle::optional& attn_mask, + DenseTensor* out, + SparseCsrTensor* softmax) { #if CUDA_VERSION >= 11070 /* Check Shape */ auto q_dim = query.dims(); @@ -183,34 +184,40 @@ void FusedAttentionCsrKernel(const Context& dev_ctx, phi::errors::InvalidArgument("dense shape of 'sparse_mask' must be " "[batch_size*num_heads, seq_len, seq_len]")); - PADDLE_ENFORCE_EQ( - key_padding_mask.dims().size(), - 2, - phi::errors::InvalidArgument( - "shape of 'key_padding_mask' must be [batch_size, seq_len]")); - PADDLE_ENFORCE_EQ( - key_padding_mask.dims()[0], - q_dim[0], - phi::errors::InvalidArgument( - "shape of 'key_padding_mask' must be [batch_size, seq_len]")); - PADDLE_ENFORCE_EQ( - key_padding_mask.dims()[1], - M, - phi::errors::InvalidArgument( - "shape of 'key_padding_mask' must be [batch_size, seq_len]")); - - PADDLE_ENFORCE_EQ(attn_mask.dims().size(), - 2, - phi::errors::InvalidArgument( - "shape of 'attn_mask' must be [seq_len, seq_len]")); - PADDLE_ENFORCE_EQ(attn_mask.dims()[0], - M, - phi::errors::InvalidArgument( - "shape of 'attn_mask' must be [seq_len, seq_len]")); - PADDLE_ENFORCE_EQ(attn_mask.dims()[1], - M, - phi::errors::InvalidArgument( - "shape of 'attn_mask' must be [seq_len, seq_len]")); + const auto kp_mask_ptr = key_padding_mask.get_ptr(); + if (kp_mask_ptr) { + PADDLE_ENFORCE_EQ( + kp_mask_ptr->dims().size(), + 2, + phi::errors::InvalidArgument( + "shape of 'key_padding_mask' must be [batch_size, seq_len]")); + PADDLE_ENFORCE_EQ( + kp_mask_ptr->dims()[0], + q_dim[0], + phi::errors::InvalidArgument( + "shape of 'key_padding_mask' must be [batch_size, seq_len]")); + PADDLE_ENFORCE_EQ( + kp_mask_ptr->dims()[1], + M, + phi::errors::InvalidArgument( + "shape of 'key_padding_mask' must be [batch_size, seq_len]")); + } + + const auto attn_mask_ptr = attn_mask.get_ptr(); + if (attn_mask_ptr) { + PADDLE_ENFORCE_EQ(attn_mask_ptr->dims().size(), + 2, + phi::errors::InvalidArgument( + "shape of 'attn_mask' must be [seq_len, seq_len]")); + PADDLE_ENFORCE_EQ(attn_mask_ptr->dims()[0], + M, + phi::errors::InvalidArgument( + "shape of 'attn_mask' must be [seq_len, seq_len]")); + PADDLE_ENFORCE_EQ(attn_mask_ptr->dims()[1], + M, + phi::errors::InvalidArgument( + "shape of 'attn_mask' must be [seq_len, seq_len]")); + } /* Step1: SDD Matmul, reuse */ SparseCsrTensor sdd_result; @@ -244,8 +251,8 @@ void FusedAttentionCsrKernel(const Context& dev_ctx, sdd_result.non_zero_crows().data(), sdd_result.non_zero_cols().data(), 
sdd_result.non_zero_elements().data(), - key_padding_mask.data(), - attn_mask.data(), + kp_mask_ptr ? kp_mask_ptr->data() : nullptr, + attn_mask_ptr ? attn_mask_ptr->data() : nullptr, softmax->mutable_non_zero_elements()->data(), M, total_row_num, diff --git a/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py index e34f890cc53d4..0383247886ff2 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py @@ -47,6 +47,7 @@ def setUp(self): self.seq_len = 128 self.head_dim = 16 self.dtype = 'float64' + self.use_mask = True def test_dygraph(self): with _test_eager_guard(): @@ -69,37 +70,49 @@ def test_dygraph(self): sp_mask = mask.reshape([-1, self.seq_len, self.seq_len]).to_sparse_csr() - kp_mask = paddle.randint( - 0, 2, [self.batch_size, self.seq_len]).astype(self.dtype) - attn_mask = paddle.randint( - 0, 2, [self.seq_len, self.seq_len]).astype(self.dtype) - - sdd = paddle.matmul(query, key, False, True) / math.sqrt( - float(self.head_dim)) - sdd = sdd + ( - (mask * kp_mask.unsqueeze([1, 2]) * attn_mask) - 1.0) * 1e9 - softmax = paddle.nn.functional.softmax(sdd) - output = paddle.matmul(softmax, value) - output.backward() - - query_cp = copy.deepcopy(query) - key_cp = copy.deepcopy(key) - value_cp = copy.deepcopy(value) - - query_cp.stop_gradient = False - key_cp.stop_gradient = False - value_cp.stop_gradient = False - - output_cp = paddle.incubate.sparse.nn.functional.attention( - query_cp, key_cp, value_cp, sp_mask, kp_mask, attn_mask) - output_cp.backward() - - self.assertTrue(np.allclose(output_cp.numpy(), output.numpy())) + query_sp = copy.deepcopy(query) + key_sp = copy.deepcopy(key) + value_sp = copy.deepcopy(value) + + query_sp.stop_gradient = False + key_sp.stop_gradient = False + value_sp.stop_gradient = False + + if self.use_mask: + kp_mask = paddle.randint( + 0, 2, [self.batch_size, self.seq_len]).astype(self.dtype) + attn_mask = paddle.randint( + 0, 2, [self.seq_len, self.seq_len]).astype(self.dtype) + + sdd = paddle.matmul(query, key, False, True) / math.sqrt( + float(self.head_dim)) + sdd = sdd + ( + (mask * kp_mask.unsqueeze([1, 2]) * attn_mask) - 1.0) * 1e9 + softmax = paddle.nn.functional.softmax(sdd) + output = paddle.matmul(softmax, value) + output.backward() + + output_sp = paddle.incubate.sparse.nn.functional.attention( + query_sp, key_sp, value_sp, sp_mask, kp_mask, attn_mask) + output_sp.backward() + else: + sdd = paddle.matmul(query, key, False, True) / math.sqrt( + float(self.head_dim)) + sdd = sdd + (mask - 1.0) * 1e9 + softmax = paddle.nn.functional.softmax(sdd) + output = paddle.matmul(softmax, value) + output.backward() + + output_sp = paddle.incubate.sparse.nn.functional.attention( + query_sp, key_sp, value_sp, sp_mask) + output_sp.backward() + + self.assertTrue(np.allclose(output_sp.numpy(), output.numpy())) self.assertTrue( - np.allclose(query_cp.grad.numpy(), query.grad.numpy())) - self.assertTrue(np.allclose(key_cp.grad.numpy(), key.grad.numpy())) + np.allclose(query_sp.grad.numpy(), query.grad.numpy())) + self.assertTrue(np.allclose(key_sp.grad.numpy(), key.grad.numpy())) self.assertTrue( - np.allclose(value_cp.grad.numpy(), value.grad.numpy())) + np.allclose(value_sp.grad.numpy(), value.grad.numpy())) class TestSparseAttentionAPI2(TestSparseAttentionAPI1): @@ -110,6 +123,7 @@ def setUp(self): self.seq_len = 128 self.head_dim = 32 self.dtype = 'float64' + self.use_mask 
= False class TestSparseAttentionAPI3(TestSparseAttentionAPI1): @@ -120,6 +134,7 @@ def setUp(self): self.seq_len = 512 self.head_dim = 16 self.dtype = 'float64' + self.use_mask = True class TestSparseAttentionAPI4(TestSparseAttentionAPI1): @@ -130,6 +145,7 @@ def setUp(self): self.seq_len = 512 self.head_dim = 32 self.dtype = 'float64' + self.use_mask = False class TestSparseAttentionAPI5(TestSparseAttentionAPI1): @@ -140,6 +156,7 @@ def setUp(self): self.seq_len = 512 self.head_dim = 64 self.dtype = 'float64' + self.use_mask = True if __name__ == '__main__': diff --git a/python/paddle/incubate/sparse/nn/functional/transformer.py b/python/paddle/incubate/sparse/nn/functional/transformer.py index 3429eecccd758..f69714700bf5d 100644 --- a/python/paddle/incubate/sparse/nn/functional/transformer.py +++ b/python/paddle/incubate/sparse/nn/functional/transformer.py @@ -23,8 +23,8 @@ def attention(query, key, value, sparse_mask, - key_padding_mask, - attn_mask, + key_padding_mask=None, + attn_mask=None, name=None): """ Note: @@ -50,10 +50,10 @@ def attention(query, sparse_mask(SparseCsrTensor): The sparse layout in the Attention module. Its dense shape is `[batch_size*num_heads, seq_len, seq_len]` . `nnz` of each batch must be the same. dtype of `crows` and `cols` must be int64, dtype of `values` can be float32 or float64. - key_padding_mask(DenseTensor): The key padding mask tensor in the Attention module. - 2D tensor with shape: [batch_size, seq_len]. dtype can be float32 or float64. - attn_mask(DenseTensor):The attention mask tensor in the Attention module. - 2D tensor with shape: [seq_len, seq_len]. dtype can be float32 or float64. + key_padding_mask(DenseTensor, optional): The key padding mask tensor in the Attention module. + 2D tensor with shape: [batch_size, seq_len]. dtype can be float32 or float64. Default: None. + attn_mask(DenseTensor, optional): The attention mask tensor in the Attention module. + 2D tensor with shape: [seq_len, seq_len]. dtype can be float32 or float64. Default: None. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. 
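
[Note, not part of the diff] A minimal usage sketch of the API changed by this patch, with and without the now-optional masks. It is an illustration only: the shapes, the all-ones masks and the variable names are made up here; it assumes a CUDA build of Paddle (the CPU kernel above only raises an error) and, as in the unit test, it may additionally require the eager dygraph mode.

    import paddle

    batch_size, num_heads, seq_len, head_dim = 2, 2, 8, 4

    query = paddle.randn([batch_size, num_heads, seq_len, head_dim])
    key = paddle.randn([batch_size, num_heads, seq_len, head_dim])
    value = paddle.randn([batch_size, num_heads, seq_len, head_dim])

    # Dense 0/1 layout mask, one [seq_len, seq_len] block per (batch, head),
    # converted to the batched CSR layout the kernel expects; all-ones keeps
    # every position and trivially satisfies the "same nnz per batch" rule.
    sp_mask = paddle.ones([batch_size * num_heads, seq_len, seq_len]).to_sparse_csr()

    # New in this patch: key_padding_mask / attn_mask can be omitted.
    out = paddle.incubate.sparse.nn.functional.attention(query, key, value, sp_mask)

    # Masks passed explicitly; a zero entry subtracts 1e9 from the corresponding
    # logit before the softmax, mirroring the reference computation in the test.
    kp_mask = paddle.ones([batch_size, seq_len])
    attn_mask = paddle.ones([seq_len, seq_len])
    out_masked = paddle.incubate.sparse.nn.functional.attention(
        query, key, value, sp_mask, kp_mask, attn_mask)

    print(out.shape)  # [2, 2, 8, 4]

With all-ones masks the two calls compute the same result, since the mask term only subtracts 1e9 from positions whose combined mask entry is zero.
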
From d520029fa85cc28e0a732efce9c0b34a072d0673 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 6 Jul 2022 18:01:45 +0800 Subject: [PATCH 079/250] Merge prepare_data and prepare_phi_data in static graph (#43878) * merge prepare_data and prepare_phi_data in static graph * fix bug * fix data transform bug * fix bug * fix unittest of cinn --- paddle/fluid/framework/operator.cc | 236 ++++++++---------- paddle/fluid/framework/operator.h | 8 - .../operators/cinn/cinn_launch_op_test.cc | 20 +- 3 files changed, 118 insertions(+), 146 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2be93f0dc9178..0a5de2bd3f262 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1622,9 +1622,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::EventRole::kInnerOp); if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; - // Do data transform before building KernelContext - // TODO(zhiqiu): support TransferInplaceVarsBack - PreparePhiData(exec_scope, *pt_kernel_, *kernel_signature_, runtime_ctx); if (enable_cache_runtime_context_ && !need_prepare_phi_data_ && !need_prepare_data_) { impl_ = @@ -2007,15 +2004,15 @@ Scope* OperatorWithKernel::PrepareData( } } - for (auto& var_name_item : Inputs()) { - bool should_skip_input = - no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0; - - std::vector& input_vars = ctx->inputs[var_name_item.first]; - - for (size_t i = 0; i < var_name_item.second.size(); ++i) { - auto& var_name = var_name_item.second[i]; - auto* var = input_vars[i]; + const auto& name_map = Inputs(); + auto prepare_input_data = [&](const std::string& in_name, + std::vector* in_vars, + const phi::TensorArgDef* in_def, + bool should_skip_input) -> void { + auto& name_vec = name_map.at(in_name); + for (size_t i = 0; i < in_vars->size(); ++i) { + const auto& var_name = name_vec[i]; + auto* var = in_vars->at(i); // Only tensor can be tranfer to another device. 
if (var == nullptr || !VarIsTensor(*var)) { @@ -2046,17 +2043,17 @@ Scope* OperatorWithKernel::PrepareData( new_scope = &scope.NewScope(); } auto* trans_var = new_scope->Var(var_name); - input_vars[i] = trans_var; + in_vars->at(i) = trans_var; auto out = trans_var->GetMutable(); out->Resize(tensor_in->dims()); platform::MatchShapeToLayout( out, tensor_in->layout(), DataLayout::kNHWC); VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " "but kNHWC layout" - << var_name_item.first << " in Operator " << type_; + << in_name << " in Operator " << type_; } else { - VLOG(7) << "Skip scanning input " << var_name_item.first - << " in Operator " << type_; + VLOG(7) << "Skip scanning input " << in_name << " in Operator " + << type_; } #endif continue; @@ -2066,15 +2063,46 @@ Scope* OperatorWithKernel::PrepareData( continue; } - auto kernel_type_for_var = GetKernelTypeForVar( - var_name_item.first, *tensor_in, expected_kernel_key); + auto kernel_type_for_var = + GetKernelTypeForVar(in_name, *tensor_in, expected_kernel_key); + bool need_trans_dtype = + kernel_type_for_var.data_type_ != expected_kernel_key.data_type_; + bool need_trans_layout = NeedTransformLayout( + kernel_type_for_var.data_layout_, expected_kernel_key.data_layout_); + if (!need_trans_dtype && !need_trans_layout) { + if (!run_phi_kernel_ && + platform::places_are_same_class(kernel_type_for_var.place_, + expected_kernel_key.place_)) { + continue; + } + } - if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { - continue; + std::unique_ptr new_expected_kernel_key = nullptr; + if (run_phi_kernel_ && in_def->backend != phi::Backend::ALL_BACKEND) { + auto tensor_backend = phi::TransToPhiBackend(tensor_in->place()); + if ((in_def->backend != tensor_backend && + (in_def->backend != phi::Backend::GPUDNN || + tensor_backend != phi::Backend::GPU)) || + tensor_in->place().GetType() == AllocationType::GPUPINNED) { + new_expected_kernel_key = std::make_unique( + expected_kernel_key.data_type_, + phi::TransToPhiPlace(in_def->backend), + expected_kernel_key.data_layout_, + expected_kernel_key.library_type_, + expected_kernel_key.customized_type_value_); + } + } + + if (!need_trans_dtype && !need_trans_layout) { + if (run_phi_kernel_ && new_expected_kernel_key == nullptr) { + continue; + } } VLOG(3) << "Transform Variable " << var_name << " from " - << kernel_type_for_var << " to " << expected_kernel_key; + << kernel_type_for_var << " to " + << (new_expected_kernel_key ? *new_expected_kernel_key + : expected_kernel_key); // In the inference scenerio, the scopes will be reused across the // batches, so the `new_scope` here will result in GPU memroy explosion @@ -2094,13 +2122,22 @@ Scope* OperatorWithKernel::PrepareData( // not do transfer scope caching, and cpu inference performance is not // impacted by test. 
enable_cache_transfer_scope_ = false; - if (!run_by_executor_ && - (platform::is_gpu_place(kernel_type_for_var.place_) || - platform::is_gpu_place(expected_kernel_key.place_))) { - new_scope = TryCreateTransferScope( - kernel_type_for_var, expected_kernel_key, &scope); - enable_cache_transfer_scope_ = true; + if (!run_by_executor_) { + if (new_expected_kernel_key) { + if ((platform::is_gpu_place(kernel_type_for_var.place_) || + platform::is_gpu_place(new_expected_kernel_key->place_))) { + new_scope = TryCreateTransferScope( + kernel_type_for_var, *new_expected_kernel_key, &scope); + enable_cache_transfer_scope_ = true; + } + } else if ((platform::is_gpu_place(kernel_type_for_var.place_) || + platform::is_gpu_place(expected_kernel_key.place_))) { + new_scope = TryCreateTransferScope( + kernel_type_for_var, expected_kernel_key, &scope); + enable_cache_transfer_scope_ = true; + } } + if (!new_scope) { new_scope = &scope.NewScope(); } @@ -2117,7 +2154,7 @@ Scope* OperatorWithKernel::PrepareData( // Create new var with the same name in transfer scopes auto* trans_var = new_scope->Var(var_name); - input_vars[i] = trans_var; + in_vars->at(i) = trans_var; // Find if inplace exists between input and output // If inplace exists, set the new created var to inplaced output, and @@ -2125,7 +2162,7 @@ Scope* OperatorWithKernel::PrepareData( for (auto& pair : Outputs()) { for (size_t j = 0; j < pair.second.size(); ++j) { if (pair.second[j] == var_name) { - VLOG(4) << "Found inplace between input(" << var_name_item.first + VLOG(4) << "Found inplace between input(" << in_name << ") and output(" << pair.first << "), the variable name is " << var_name; ctx->outputs[pair.first][j] = trans_var; @@ -2136,9 +2173,47 @@ Scope* OperatorWithKernel::PrepareData( // Do transfer Tensor out; - TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); + TransformData(new_expected_kernel_key ? 
*new_expected_kernel_key + : expected_kernel_key, + kernel_type_for_var, + *tensor_in, + &out); SetTensorToVariable(*var, out, trans_var); } + }; + + if (run_phi_kernel_) { + const auto& input_names = kernel_signature_->input_names; + const auto& input_defs = pt_kernel_->args_def().input_defs(); + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), + platform::errors::InvalidArgument( + "The size of inputs_args names (%d) must be equal to " + "the size of kernel input_defs (%d).", + input_names.size(), + input_defs.size())); + for (size_t i = 0; i < input_defs.size(); ++i) { + const auto& input_defs = pt_kernel_->args_def().input_defs(); + auto& in_def = input_defs.at(i); + std::string input_name = input_names[i]; + auto iter = ctx->inputs.find(input_name); + if (iter == ctx->inputs.end()) { + continue; + } + auto& ins_vector = iter->second; + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(input_name) > 0; + prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input); + } + } else { + for (auto& var_name_item : Inputs()) { + bool should_skip_input = + no_buffer_ins && no_buffer_ins->count(var_name_item.first) > 0; + + std::vector& input_vars = ctx->inputs[var_name_item.first]; + prepare_input_data( + var_name_item.first, &input_vars, nullptr, should_skip_input); + } } // If pre_scope = &scope, it means that scope is cached and the op is not in @@ -2381,107 +2456,6 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( return (*arg_map_fn_)(arg_mapping_ctx); } -Scope* OperatorWithKernel::PreparePhiData( - const Scope& scope, - const phi::Kernel& pt_kernel, - const phi::KernelSignature& pt_kernel_signature, - RuntimeContext* ctx) const { - const auto& input_names = pt_kernel_signature.input_names; - auto input_defs = pt_kernel.args_def().input_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), - input_defs.size(), - platform::errors::InvalidArgument( - "The size of inputs_args names (%d) must be equal to " - "the size of kernel input_defs (%d).", - input_names.size(), - input_defs.size())); - Scope* new_scope = nullptr; - auto& name_map = Inputs(); - const std::unordered_set* no_buffer_ins = nullptr; - if (info_) { - auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer(); - // Some op may not register NoNeedBufferVarsInferer - if (no_buffer_inferer) { - no_buffer_ins = &(no_buffer_inferer(Inputs(), Outputs(), Attrs())); - if (no_buffer_ins->empty()) no_buffer_ins = nullptr; - } - } - - for (size_t i = 0; i < input_defs.size(); ++i) { - auto& in_def = input_defs.at(i); - if (ctx->inputs.find(input_names[i]) == ctx->inputs.end()) { - continue; - } - auto& ins_vector = ctx->inputs.at(input_names[i]); - auto& name_vec = name_map.at(input_names[i]); - bool should_skip_input = - no_buffer_ins && no_buffer_ins->count(input_names[i]) > 0; - - for (size_t offset = 0; offset < ins_vector.size(); ++offset) { - // Only tensor can be tranfer to another device. - auto* var = ins_vector[offset]; - if (var == nullptr || !VarIsTensor(*var)) { - continue; - } - auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); - - // When no_buffer_ins then checking of Tensor::holder_ is - // not a thread safe. 
And for infershape scenario checks - // to be omitted are not really needed - if (should_skip_input == true) { - // TODO(YuanRisheng) : There need to supplement MKLDNN code later - continue; - } - - if (!tensor_in->IsInitialized()) { - continue; - } - - if (in_def.backend == phi::Backend::ALL_BACKEND) { - continue; - } - - auto tensor_backend = phi::TransToPhiBackend(tensor_in->place()); - if (in_def.backend == tensor_backend || - (in_def.backend == phi::Backend::GPUDNN && - tensor_backend == phi::Backend::GPU)) { - continue; - } - - auto expected_place = phi::TransToPhiPlace(in_def.backend); - VLOG(3) << "phi Transform Variable " << input_names[i] << " from " - << tensor_in->place() << " to " << expected_place; - - if (!new_scope) { - new_scope = &scope.NewScope(); - } - // For inference, if a gpu model has an op which could only run on CPU, - // each result of different input will be the same with the first one. - // The reason is that if a gpu tensor is the input of a cpu kernel, - // we will create a new cpu tensor in new scope. - // However, if enable_cache_runtime_context_, we get the cpu tensor each - // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr - // to trigger `new RuntimeContext()` in RunImpl(). - if (enable_cache_runtime_context_) { - pre_scope_ = nullptr; - } - - // Create new var with the same name in transfer scopes - auto* trans_var = new_scope->Var(name_vec[offset]); - ins_vector[offset] = trans_var; - - // Do transfer - Tensor out; - framework::TensorCopySync(*tensor_in, expected_place, &out); - SetTensorToVariable(*var, out, trans_var); - - need_prepare_phi_data_ = true; - } - } - - return new_scope; -} - void OperatorWithKernel::BuildPhiKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1b7bd433dd104..c3827f56c7197 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -646,14 +646,6 @@ class OperatorWithKernel : public OperatorBase { phi::KernelKey ChoosePhiKernel(const ExecutionContext& ctx) const; void ChooseKernel(const ExecutionContext& ctx) const; - /** - * Transfer data place for phi kernel - * Is this really needed? 
- */ - Scope* PreparePhiData(const Scope& scope, - const phi::Kernel& pt_kernel, - const phi::KernelSignature& pt_kernel_signature, - RuntimeContext* ctx) const; void BuildPhiKernelContext(const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 674f55efb5feb..5b965573deefa 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -89,31 +89,37 @@ class TestCinnLaunchOp : public ::testing::Test { void TearDown() override { CinnCompiler::GetInstance()->Clear(); } }; -TEST_F(TestCinnLaunchOp, TestRunInstructionByPE) { - // CPU +TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByPE) { RunAndCheck(platform::CPUPlace()); // the second run on the same place is to check the cache logic RunAndCheck(platform::CPUPlace()); +} + #ifdef PADDLE_WITH_CUDA - // GPU +TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByPE) { RunAndCheck(platform::CUDAPlace()); RunAndCheck(platform::CUDAPlace()); -#endif } +#endif -TEST_F(TestCinnLaunchOp, TestRunInstructionByCinnProgram) { +TEST_F(TestCinnLaunchOp, TestRunCPUInstructionByCinnProgram) { // set FLAGS_enable_pe_launch_cinn=false to switch to use // default scheduler of CINN to execute the compiled program FLAGS_enable_pe_launch_cinn = false; RunAndCheck(platform::CPUPlace()); RunAndCheck(platform::CPUPlace()); +} + #ifdef PADDLE_WITH_CUDA - // GPU +TEST_F(TestCinnLaunchOp, TestRunGPUInstructionByCinnProgram) { + // set FLAGS_enable_pe_launch_cinn=false to switch to use + // default scheduler of CINN to execute the compiled program + FLAGS_enable_pe_launch_cinn = false; RunAndCheck(platform::CUDAPlace()); RunAndCheck(platform::CUDAPlace()); -#endif } +#endif TEST_F(TestCinnLaunchOp, TestRunWithAutoTuneEnabled) { FLAGS_enable_cinn_auto_tune = true; From 3fd6f09fb413e56d067cd0ec5097546ddf0e0ec9 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Wed, 6 Jul 2022 18:45:44 +0800 Subject: [PATCH 080/250] [Paddle-TRT] support inpus is weight (#44051) * support inpus is weight --- .../inference/tensorrt/convert/op_converter.h | 165 ++++++++++++++---- paddle/fluid/inference/tensorrt/engine.h | 9 + .../tensorrt/tensorrt_engine_op_test.cc | 1 - 3 files changed, 140 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index d179e8bb34c16..8bcc926b856e2 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -230,10 +230,54 @@ class OpConverter { const framework::Scope& scope, TensorRTEngine* engine) { std::unique_lock lk(mut_); + for (int i = 0; i < block.ops_size(); i++) { + SetEngine(engine); + const auto& op = block.ops(i); + framework::OpDesc op_desc(op, nullptr); + framework::Variable* X_v = nullptr; + std::string X_name; + // inputs : string -> std::vector + auto inputs = op_desc.Inputs(); + if (inputs.count("X")) { + X_name = op_desc.Input("X")[0]; + } else if (inputs.count("Input")) { + X_name = op_desc.Input("Input")[0]; + } else if (inputs.count("Y")) { + X_name = op_desc.Input("Y")[0]; + } + X_v = scope.FindVar(X_name); + // If this weight is shared between ops, it needn't to be convtered to + // itensor once again + if (engine->GetITensorMap()->count(X_name)) { + continue; + } + if (X_v) { + ConvertWeight2ITensor(scope, X_name); + } + } for (int i = 0; i 
< block.ops_size(); i++) { const auto& op = block.ops(i); ConvertOp(op, parameters, scope, engine); } + for (int i = 0; i < engine->network()->getNbLayers(); i++) { + auto layer = engine->network()->getLayer(i); + if (layer->getType() == nvinfer1::LayerType::kSHUFFLE) { + auto* input_tensor = layer->getInput(0); + auto* output_tensor = layer->getOutput(0); + auto output_tensor_name = output_tensor->getName(); + auto input_tensor_name = input_tensor->getName(); + if (engine->DynamicRangeIsSet(input_tensor) && + !engine->DynamicRangeIsSet(output_tensor)) { + float output_scale = engine->GetTensorDynamicRange(input_tensor); + VLOG(1) << "Set output tensor scale = " << output_scale + << " for tensor in TensorRT: " << output_tensor_name << "."; + engine->SetTensorDynamicRange(output_tensor, output_scale); + } else { + VLOG(1) << "Failed to get input tensor scale for tensor in TensorRT: " + << input_tensor_name << "."; + } + } + } } // The scope here should be inited with the parameter vars. @@ -273,8 +317,8 @@ class OpConverter { continue; } std::vector input_shape; - input_shape.push_back(-1); - for (size_t i = 1; i < ranks; i++) { + // input_shape.push_back(-1); + for (size_t i = 0; i < ranks; i++) { if (min_input_shape[i] != max_input_shape[i]) { input_shape.push_back(-1); } else { @@ -402,6 +446,17 @@ class OpConverter { return c; } + nvinfer1::ITensor* FloorDiv(nvinfer1::ITensor* a, nvinfer1::ITensor* b) { + nvinfer1::ITensor* c = + TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *a, + *b, + nvinfer1::ElementWiseOperation::kFLOOR_DIV) + ->getOutput(0); + return c; + } + nvinfer1::ITensor* Act(nvinfer1::ITensor* a, nvinfer1::ActivationType act_type) { nvinfer1::ITensor* c = @@ -422,22 +477,27 @@ class OpConverter { ->getOutput(0); return tensor; } - - // Create and add Multi-D constant float layer - nvinfer1::ITensor* AddConstantLayer(const float* data, + template + // Create and add Multi-D constant float/int32 layer + nvinfer1::ITensor* AddConstantLayer(const T* data, const std::vector& weight_dims, const std::string& weight_name) { - std::unique_ptr tmp_tensor(new framework::Tensor()); int data_size = std::accumulate( weight_dims.begin(), weight_dims.end(), 1, std::multiplies()); + std::unique_ptr tmp_tensor(new framework::Tensor()); tmp_tensor->Resize({data_size}); - auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); for (int i = 0; i < data_size; i++) { tmp_data[i] = data[i]; } engine_->SetWeights(weight_name, std::move(tmp_tensor)); - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; + if (std::is_integral::value) { + trt_dtype = nvinfer1::DataType::kINT32; + } + + TensorRTEngine::Weight weight{trt_dtype, static_cast(tmp_data), static_cast(data_size)}; nvinfer1::Dims trt_dims; @@ -449,44 +509,26 @@ class OpConverter { return const_layer->getOutput(0); } - // Create and add 1D constant float layer - nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, + // Create and add 1D constant float/int32 layer + template + nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, const std::string& weight_name = "", bool scalar = false) { std::unique_ptr tmp_tensor(new framework::Tensor()); int data_size = data.size(); tmp_tensor->Resize({data_size}); - auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); for (int i = 0; i < data_size; i++) { tmp_data[i] = 
data[i]; } engine_->SetWeights(weight_name, std::move(tmp_tensor)); - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(tmp_data), - static_cast(data_size)}; - nvinfer1::Dims input_shape; - input_shape.nbDims = scalar ? 0 : 1; - input_shape.d[0] = data_size; - auto const_layer = - TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); - return const_layer->getOutput(0); - } - - // Create and add 1D constant layer - nvinfer1::ITensor* Add1DConstantLayer(const std::vector& data, - const std::string& weight_name = "", - bool scalar = false) { - std::unique_ptr tmp_tensor(new framework::Tensor()); - int data_size = data.size(); - tmp_tensor->Resize({data_size}); - auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = data[i]; + nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; + if (std::is_integral::value) { + trt_dtype = nvinfer1::DataType::kINT32; } - engine_->SetWeights(weight_name, std::move(tmp_tensor)); - TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32, + TensorRTEngine::Weight weight{trt_dtype, static_cast(tmp_data), static_cast(data_size)}; nvinfer1::Dims input_shape; @@ -513,6 +555,61 @@ class OpConverter { return Add1DConstantLayer(tmp_data, weight_name, scalar); } + // For cases when input is not middle-tensor , but persistable tensor + // you should call this. + nvinfer1::ITensor* ConvertWeight2ITensor(const framework::Scope& scope, + const std::string& name) { + auto* var_v = scope.FindVar(name); + auto* var_t = var_v->GetMutable(); + void* trt_ptr = nullptr; + size_t trt_num = static_cast(var_t->numel()); + nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; + if (var_t->dtype() == phi::DataType::FLOAT32) { + float* data_ptr = engine_->GetWeightCPUData(name, var_t); + trt_ptr = static_cast(data_ptr); + } else if (var_t->dtype() == phi::DataType::INT32) { + int32_t* data_ptr = engine_->GetWeightCPUData(name, var_t); + trt_ptr = static_cast(data_ptr); + trt_dtype = nvinfer1::DataType::kINT32; + } else if (var_t->dtype() == phi::DataType::INT64) { + int64_t* data_ptr = engine_->GetWeightCPUData(name, var_t); + // We must create a new framework::Tensor() + std::unique_ptr new_var_t(new framework::Tensor()); + new_var_t->Resize({var_t->numel()}); + int32_t* new_data_ptr = + new_var_t->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < trt_num; i++) { + new_data_ptr[i] = data_ptr[i]; + } + engine_->SetWeights(name, std::move(new_var_t)); + trt_ptr = static_cast(new_data_ptr); + trt_dtype = nvinfer1::DataType::kINT32; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported datatype in TensorRT")); + } + // Now we have create weights, then we need create a itensor + auto var_dims = var_t->dims(); + nvinfer1::Dims trt_in_shape; + trt_in_shape.nbDims = var_t->dims().size(); + for (int64_t i = 0; i < trt_in_shape.nbDims; i++) { + trt_in_shape.d[i] = var_dims[i]; + } + // In fact , this is not always right, because we can't determine if the 0th + // dimension is batch. 
Just for run chenqu's model + if (!engine_->with_dynamic_shape()) { + trt_in_shape.nbDims--; + for (int i = 0; i < trt_in_shape.nbDims; i++) { + trt_in_shape.d[i] = trt_in_shape.d[i + 1]; + } + } + TensorRTEngine::Weight weight{trt_dtype, trt_ptr, trt_num}; + nvinfer1::ILayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_in_shape, weight.get()); + engine_->SetITensor(name, layer->getOutput(0)); + return layer->getOutput(0); + } + void RreplenishLayerAndOutput( nvinfer1::ILayer* layer, const std::string& layer_type, diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index c75f7dd17cb95..5c2bb6e0ca07f 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -406,6 +406,15 @@ class TensorRTEngine { void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { quant_dynamic_range_[tensor] = range; } + + float GetTensorDynamicRange(nvinfer1::ITensor* tensor) { + return quant_dynamic_range_[tensor]; + } + + bool DynamicRangeIsSet(nvinfer1::ITensor* tensor) { + return quant_dynamic_range_.count(tensor); + } + template T* GetWeightCPUData(const std::string& name, framework::Tensor* weight_tensor); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 08a71ad713a1b..cbe14195d4106 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -150,7 +150,6 @@ void DynamicShapeTest(bool allow_build_at_runtime) { else CreateCUDATensor(&scope, "x", std::vector({2, 4, 1, 1})); CreateCUDATensor(&scope, "y", std::vector({4, 6})); - CreateCUDATensor(&scope, "z", std::vector({2, 6})); CreateCUDATensor(&scope, "y0", std::vector({6, 8})); CreateCUDATensor(&scope, "z0", std::vector({2, 8})); From aa18ae11f21a957584f3125b9fb5e4deba7a3827 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Wed, 6 Jul 2022 13:17:47 +0200 Subject: [PATCH 081/250] Set FC input data format to ANY (#44023) * Fc add any to input format * Pre-commit changes --- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 79551b6d59a2c..8cfbc95be7a1a 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -209,7 +209,7 @@ class FCPrimitiveFactory { const Tensor* bias, LoDTensor* output, const ExecutionContext& ctx) { - auto src_desc = CreateMemDescriptor(input, input->format()); + auto src_desc = CreateMemDescriptor(input, MKLDNNMemoryFormat::any); auto weight_dims = Get2DWeightDimsForDNNL(weights); auto weights_desc = CreateMemDescriptor(weight_dims, MKLDNNMemoryFormat::any); @@ -236,7 +236,8 @@ class FCPrimitiveFactory { auto input_dims = phi::vectorize(input->dims()); std::vector new_input_dims = { input_dims[0] * input_dims[1], input_dims[2], 1}; - auto src_desc = CreateMemDescriptor(new_input_dims, input->format()); + auto src_desc = + CreateMemDescriptor(new_input_dims, MKLDNNMemoryFormat::any); auto weight_dims = Get3DWeightDimsForDNNL(weights); auto weights_desc = @@ -267,7 +268,7 @@ class FCPrimitiveFactory { const Tensor* bias, LoDTensor* output, const ExecutionContext& ctx) { - auto src_desc = CreateMemDescriptor(input, input->format()); + auto src_desc = CreateMemDescriptor(input, MKLDNNMemoryFormat::any); // Since MKL-DNN doesn't support 4D 
column-major data formats in // inner_product primitive, transpose the weights to be in // row-major format From bbe995556130f7cd77241df999b2eb0cebd4a146 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Wed, 6 Jul 2022 19:26:57 +0800 Subject: [PATCH 082/250] make inference_c test linking only paddle_inference_c (#44126) --- paddle/fluid/inference/capi/CMakeLists.txt | 4 ++ .../fluid/inference/capi_exp/CMakeLists.txt | 4 ++ .../fluid/inference/tests/api/CMakeLists.txt | 60 +++---------------- 3 files changed, 16 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/inference/capi/CMakeLists.txt b/paddle/fluid/inference/capi/CMakeLists.txt index 25d8a39dc6374..73ba41607aae8 100644 --- a/paddle/fluid/inference/capi/CMakeLists.txt +++ b/paddle/fluid/inference/capi/CMakeLists.txt @@ -20,6 +20,10 @@ cc_library( SRCS ${C_API_SRCS} DEPS paddle_inference) +if(NOT ON_INFER) + return() +endif() + # Create inference capi shared library cc_library( paddle_inference_c_shared SHARED diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt index 56de57cbb9c85..e35e14a0c0241 100644 --- a/paddle/fluid/inference/capi_exp/CMakeLists.txt +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -20,6 +20,10 @@ cc_library( SRCS ${C_API_SRCS} DEPS paddle_inference) +if(NOT ON_INFER) + return() +endif() + # Create inference capi shared library cc_library( paddle_inference_c_shared SHARED diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 610883ad1ad27..1ed41417355ce 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -943,28 +943,17 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - if(WIN32) - target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c_shared) - else() - target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c) - endif() inference_analysis_test( test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - if(WIN32) - target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c_shared) - else() - target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c) - endif() - set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") if(NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) @@ -1110,44 +1099,27 @@ inference_analysis_test( SRCS analyzer_capi_exp_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_c ARGS --infer_model=${RESNET50_MODEL_DIR}/model) -if(WIN32) - target_link_libraries(test_analyzer_capi_exp paddle_inference_c_shared) -else() - target_link_libraries(test_analyzer_capi_exp paddle_inference_c) -endif() inference_analysis_test( test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) -if(WIN32) - target_link_libraries(test_analyzer_capi_exp_pd_config - paddle_inference_c_shared) -else() - target_link_libraries(test_analyzer_capi_exp_pd_config paddle_inference_c) -endif() inference_analysis_test( test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc 
EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) -if(WIN32) - target_link_libraries(test_analyzer_capi_exp_pd_tensor - paddle_inference_c_shared) -else() - target_link_libraries(test_analyzer_capi_exp_pd_tensor paddle_inference_c) -endif() if(NOT APPLE AND NOT WIN32) inference_analysis_test( @@ -1155,15 +1127,9 @@ if(NOT APPLE AND NOT WIN32) SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - if(WIN32) - target_link_libraries(test_analyzer_capi_exp_pd_threads - paddle_inference_c_shared) - else() - target_link_libraries(test_analyzer_capi_exp_pd_threads paddle_inference_c) - endif() endif() inference_analysis_test( @@ -1205,14 +1171,9 @@ if(WITH_MKLDNN) SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_c ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) - if(WIN32) - target_link_libraries(test_analyzer_capi_exp_int paddle_inference_c_shared) - else() - target_link_libraries(test_analyzer_capi_exp_int paddle_inference_c) - endif() endif() inference_analysis_test( @@ -1220,14 +1181,9 @@ inference_analysis_test( SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) -if(WIN32) - target_link_libraries(test_analyzer_capi_exp_ner paddle_inference_c_shared) -else() - target_link_libraries(test_analyzer_capi_exp_ner paddle_inference_c) -endif() if(WITH_GPU) inference_analysis_test( From b603dd55b8a4fa9ce0cfcff72ec46082cf0df9f9 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Wed, 6 Jul 2022 20:20:44 +0800 Subject: [PATCH 083/250] [Dy2static] FunctionScopeVisitor Enhance and substitute the original NameVisitor in If (#43967) * add support for control flow block analysis * move FunctionNameLivenessAnalysis into utils * pass test_ifelse.py * remove duplicate data_layer_not_check * pass the test_ifelse.py * fix unittest error . 
* fix all ci error in first version * temporay disable CreateVariableTransformer * fix ci errors * fix function name liveness analysis bugs * modifty def cond * fix * fix ci error - v2 * fix by code review * change return_name_ids -> return_name --- .../dygraph_to_static/ast_transformer.py | 1 + .../dygraph_to_static/convert_operators.py | 60 ++-- .../dygraph_to_static/ifelse_transformer.py | 223 +----------- .../dygraph_to_static/loop_transformer.py | 101 +----- .../dygraph_to_static/return_transformer.py | 108 +----- .../fluid/dygraph/dygraph_to_static/utils.py | 326 +++++++++++++++--- .../dygraph_to_static/variable_trans_func.py | 9 +- python/paddle/fluid/layers/control_flow.py | 103 +++++- python/paddle/fluid/layers/utils.py | 27 ++ .../dygraph_to_static/ifelse_simple_func.py | 4 +- .../seq2seq_dygraph_model.py | 3 +- .../dygraph_to_static/test_break_continue.py | 26 +- .../test_closure_analysis.py | 38 +- .../dygraph_to_static/test_ifelse.py | 27 +- .../dygraph_to_static/test_ifelse_basic.py | 263 +------------- .../test_program_translator.py | 69 ++-- .../dygraph_to_static/test_return.py | 14 +- .../transformer_dygraph_model.py | 3 +- .../paddle/fluid/tests/unittests/test_cond.py | 25 +- 19 files changed, 629 insertions(+), 801 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index ab4133099eaf3..f1ab097758b71 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -96,6 +96,7 @@ def transfer_from_node_type(self, node_wrapper): BreakContinueTransformer, # break/continue in loops ReturnTransformer, # return in functions LogicalTransformer, # logical and/or/not + #CreateVariableTransformer, # create undefined var for if / while / for LoopTransformer, # for/while -> while_op IfElseTransformer, # if/else -> cond_op AssertTransformer, # assert statement diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index c0c679e2e1ef0..583db5c0dcdba 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re + from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable from paddle.fluid.framework import core, Variable @@ -21,7 +23,7 @@ from paddle.fluid.layers import cast, control_flow, logical_and, logical_not, logical_or, nn from paddle.fluid.layers.control_flow import cond, while_loop, less_than, increment from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_VAR_NAME -from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar +from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, Dygraph2StaticException def convert_while_loop(cond, body, getter, setter): @@ -41,11 +43,9 @@ def convert_while_loop(cond, body, getter, setter): # If loop_vars is changed during cond callable, then it causes bug, but current logical_and/logical_not/... doesn't change the loop_vars. 
pred = cond() if isinstance(pred, Variable): - loop_vars = _run_paddle_while(cond, body, getter, setter) + _run_paddle_while(cond, body, getter, setter) else: - loop_vars = _run_py_while(cond, body, getter, setter) - - return loop_vars + _run_py_while(cond, body, getter, setter) def _run_paddle_while(cond, body, getter, setter): @@ -61,10 +61,13 @@ def _run_paddle_while(cond, body, getter, setter): def _run_py_while(cond, body, getter, setter): - loop_vars = getter() - while cond(): - loop_vars = body() - return loop_vars + while True: + pred = cond() + if isinstance(pred, Variable): + raise Dygraph2StaticException( + "python while pred change from bool to variable.") + if not pred: break + body() def convert_logical_and(x_func, y_func): @@ -231,17 +234,32 @@ def _run_paddle_cond(pred, true_fn, false_fn, get_args, set_args, def new_true_fn(): set_args(init_args) - outs = true_fn() - _check_no_undefined_var(outs, return_name_ids, 'if_body') - return outs + ret = true_fn() + # IfExpr will return a non-None return value, so we just return ret. + # We assume normal return has no return value. + if ret is None: return get_args() + else: return ret def new_false_fn(): set_args(init_args) - outs = false_fn() - _check_no_undefined_var(outs, return_name_ids, 'else_body') - return outs - - cond_outs = control_flow.cond(pred, new_true_fn, new_false_fn) + ret = false_fn() + if ret is None: return get_args() + else: return ret + + try: + cond_outs = control_flow.cond(pred, new_true_fn, new_false_fn, None, + return_name_ids) + except Exception as e: + if re.search("Unsupported return type of true_fn and false_fn in cond", + str(e)): + raise Dygraph2StaticException( + "Your if/else have different return type. TODO: add link to modifty. {}" + .format(str(e))) + if re.search("Incompatible return values of", str(e)): + raise Dygraph2StaticException( + "Your if/else have different number of return value. TODO: add link to modifty. {}" + .format(str(e))) + raise e return _recover_args_state(cond_outs, get_args, set_args, return_name_ids) @@ -251,8 +269,7 @@ def _run_py_ifelse(pred, true_fn, false_fn, get_args, set_args, Evaluate python original branch function if-else. 
""" py_outs = true_fn() if pred else false_fn() - py_outs = _remove_no_value_return_var(py_outs) - return _recover_args_state(py_outs, get_args, set_args, return_name_ids) + return py_outs def _remove_no_value_return_var(out): @@ -317,9 +334,10 @@ def _recover_args_state(outs, get_args, set_args, return_name_ids): assert num_outs <= num_args if num_args == 1: - final_outs = (outs, ) + final_outs = (outs, ) if not isinstance(outs, + (list, tuple)) else tuple(outs) else: - outs = (outs, ) if num_outs == 1 else outs + outs = (outs, ) if num_outs == 1 else tuple(outs) final_outs = outs + init_args[num_outs:] set_args(final_outs) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 13ac63f91057f..a65e86f8e82fd 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -27,11 +27,11 @@ from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import create_funcDef_node, ast_to_source_code -from paddle.fluid.dygraph.dygraph_to_static.utils import create_assign_node +from paddle.fluid.dygraph.dygraph_to_static.utils import create_assign_node, FunctionNameLivenessAnalysis from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var -from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_node +from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_nodes from paddle.fluid.dygraph.dygraph_to_static.utils import create_get_args_node, create_set_args_node from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer @@ -53,7 +53,8 @@ def __init__(self, wrapper_root): ), "Type of input node should be AstNodeWrapper, but received %s ." % type( wrapper_root) self.root = wrapper_root.node - self.static_analysis_visitor = StaticAnalysisVisitor(self.root) + FunctionNameLivenessAnalysis( + self.root) # name analysis of current ast tree. def transform(self): """ @@ -273,193 +274,6 @@ def _update_name_ids(self, new_name_ids): self.name_ids[name_id] = ctxs + self.name_ids[name_id] -def get_name_ids(nodes, after_node=None, end_node=None): - """ - Return all ast.Name.id of python variable in nodes range from - (after_node, end_node) exclusively. If after_node or end_node is None, the - range is unlimited. - """ - name_visitor = NameVisitor(after_node, end_node) - for node in nodes: - name_visitor.visit(node) - return name_visitor.name_ids - - -def parse_cond_args(parent_ids, - var_ids_dict, - modified_ids_dict=None, - ctx=gast.Load): - """ - Find out the ast.Name.id list of input by analyzing node's AST information. - """ - - # 1. filter the var fit the ctx - arg_name_ids = [ - var_id for var_id, var_ctx in six.iteritems(var_ids_dict) - if isinstance(var_ctx[0], ctx) - ] - - # 2. args should contain modified var ids in if-body or else-body - # case: - # - # ``` - # if b < 1: - # z = y - # else: - # z = x - # ``` - # - # In the above case, `z` should be in the args of cond() - if modified_ids_dict: - arg_name_ids = set(arg_name_ids) | set(modified_ids_dict) - - # 3. 
args should not contain the vars not in parent ids - # case : - # - # ``` - # x = 1 - # if x > y: - # z = [v for v in range(i)] - # ``` - # - # In the above case, `v` should not be in the args of cond() - arg_name_ids = set(arg_name_ids) & set(parent_ids) - - return arg_name_ids - - -def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, - after_ifelse_vars_dict): - """ - Find out the ast.Name list of output by analyzing node's AST information. - One of the following conditions should be satisfied while determining whether a variable is a return value: - 1. the var in parent scope is modified in If.body or If.orelse node. - 2. new var is both created in If.body and If.orelse node. - 3. new var is created only in one of If.body or If.orelse node, and it used as gast.Load firstly after gast.If node. - - For example: - x, y = 5, 10 - if x > 4: - x = x+1 - z = x*x - q = 10 - else: - y = y - 1 - z = y*y - m = 20 - n = 20 - - print(q) - n = 30 - print(n) - - - The return_ids are (x, y, z, q) for `If.body` and `If.orelse`node, because - 1. x is modified in If.body node, - 2. y is modified in If.body node, - 3. z is both created in If.body and If.orelse node, - 4. q is created only in If.body, and it is used by `print(q)` as gast.Load. - Note: - After transformed, q and z are created in parent scope. For example, - - x, y = 5, 10 - q = paddle.jit.dy2static.UndefindVar('q') - z = paddle.jit.dy2static.UndefindVar('z') - - def true_func(x, y, q): - x = x+1 - z = x*x - q = 10 - return x,y,z,q - - def false_func(x, y, q): - y = y - 1 - z = y*y - m = 20 - n = 20 - return x,y,z,q - - x,y,z,q = fluid.layers.cond(x>4, lambda: true_func(x, y), lambda: false_func(x, y, q)) - - m and n are not in return_ids, because - 5. m is created only in If.orelse, but it is not used after gast.If node. - 6. n is created only in If.orelse, and it is used by `n = 30` and `print(n)`, but it is not used as gast.Load firstly but gast.Store . - - """ - - def _is_return_var(ctxs): - for ctx in ctxs: - if isinstance(ctx, (gast.Store, gast.Param)): - return True - return False - - def _vars_with_store(ids_dict): - vars = [] - for k, ctxs in six.iteritems(ids_dict): - if _is_return_var(ctxs): - vars.append(k) - return vars - - def _modified_vars(child_dict, parent_dict): - return set( - [var for var in _vars_with_store(child_dict) if var in parent_dict]) - - def _vars_loaded(ids_dict): - """ - gast.Param is also a kind of `load` semantic. - """ - new_dict = defaultdict(list) - for k, ctxs in six.iteritems(ids_dict): - for ctx in ctxs: - if isinstance(ctx, (gast.Load, gast.Param)): - new_dict[k].append(ctx) - return new_dict - - # modified vars - body_modified_vars = _modified_vars(if_vars_dict, parent_vars_dict) - body_modified_vars = set( - filter(lambda x: x != ARGS_NAME, body_modified_vars)) - orelse_modified_vars = _modified_vars(else_vars_dict, parent_vars_dict) - orelse_modified_vars = set( - filter(lambda x: x != ARGS_NAME, orelse_modified_vars)) - modified_vars = body_modified_vars | orelse_modified_vars - - # new vars - # TODO(remove __args when new FunctionScopeAnalysis has been used.) - body_new_vars = set([ - var for var in _vars_with_store(if_vars_dict) - if var not in parent_vars_dict and var != ARGS_NAME - ]) - orelse_new_vars = set([ - var for var in _vars_with_store(else_vars_dict) - if var not in parent_vars_dict and var != ARGS_NAME - ]) - new_vars_in_body_or_orelse = body_new_vars | orelse_new_vars - new_vars_in_one_of_body_or_orelse = body_new_vars ^ orelse_new_vars - - # 1. 
the var in parent scope is modified in If.body or If.orelse node. - modified_vars_from_parent = modified_vars - new_vars_in_body_or_orelse - - # 2. new var is both created in If.body and If.orelse node. - new_vars_in_body_and_orelse = body_new_vars & orelse_new_vars - - # 3. new var is created only in one of If.body or If.orelse node, and it used as gast.Load firstly after gast.If node. - # TODO(zhhsplendid): the _vars_loaded can be optimized as _vars_loaded_before_store. Because if a variable is stored before load, - # the value would change by the store statement, we don't have to return to change the value. However, analysis is - # complex because if the IfElse is nested and outer IfElse store statement may not run at all. We will put this optimization - # as the future TODO - used_vars_after_ifelse = set( - [var for var in _vars_loaded(after_ifelse_vars_dict)]) - new_vars_to_create = new_vars_in_one_of_body_or_orelse & used_vars_after_ifelse | new_vars_in_body_and_orelse - - # 4. generate return_ids of if/else node. - return_ids = list(modified_vars_from_parent | new_vars_in_body_and_orelse - | new_vars_to_create) - return_ids.sort() - - return return_ids, modified_vars_from_parent, new_vars_to_create - - def _valid_nonlocal_names(return_name_ids, nonlocal_names): """ All var in return_name_ids should be in nonlocal_names. @@ -490,15 +304,8 @@ def transform_if_else(node, root): """ # TODO(liym27): Consider variable like `self.a` modified in if/else node. - parent_name_ids = get_name_ids([root], end_node=node) - body_name_ids = get_name_ids(node.body) - orelse_name_ids = get_name_ids(node.orelse) - # Get after_ifelse_name_ids, which means used var names after If.body and If.orelse node. - after_ifelse_name_ids = get_name_ids([root], after_node=node) - - return_name_ids, modified_name_ids_from_parent, new_vars_to_create = parse_cond_return( - parent_name_ids, body_name_ids, orelse_name_ids, after_ifelse_name_ids) - + new_vars_to_create = sorted(list(node.pd_scope.created_vars())) + return_name_ids = sorted(list(node.pd_scope.modified_vars())) # NOTE: Python can create variable only in if body or only in else body, and use it out of if/else. # E.g. # @@ -513,16 +320,7 @@ def transform_if_else(node, root): if "." not in name: create_new_vars_in_parent_stmts.append(create_undefined_var(name)) - parent_ids_set = set() - for k, ctxs in parent_name_ids.items(): - if any([not isinstance(ctx, gast.Load) for ctx in ctxs]): - parent_ids_set.add(k) - - true_args = parse_cond_args(parent_ids_set, body_name_ids, - modified_name_ids_from_parent) - false_args = parse_cond_args(parent_ids_set, orelse_name_ids, - modified_name_ids_from_parent) - nonlocal_names = list(true_args | false_args | new_vars_to_create) + nonlocal_names = list(return_name_ids) nonlocal_names.sort() # NOTE: All var in return_name_ids should be in nonlocal_names. 
nonlocal_names = _valid_nonlocal_names(return_name_ids, nonlocal_names) @@ -531,8 +329,7 @@ def transform_if_else(node, root): if ARGS_NAME in nonlocal_names: nonlocal_names.remove(ARGS_NAME) - nonlocal_stmt_node = [create_nonlocal_stmt_node(nonlocal_names) - ] if nonlocal_names else [] + nonlocal_stmt_node = create_nonlocal_stmt_nodes(nonlocal_names) empty_arg_node = gast.arguments(args=[], posonlyargs=[], @@ -546,12 +343,12 @@ def transform_if_else(node, root): nonlocal_stmt_node + node.body, name=unique_name.generate(TRUE_FUNC_PREFIX), input_args=empty_arg_node, - return_name_ids=return_name_ids) + return_name_ids=[]) false_func_node = create_funcDef_node( nonlocal_stmt_node + node.orelse, name=unique_name.generate(FALSE_FUNC_PREFIX), input_args=empty_arg_node, - return_name_ids=return_name_ids) + return_name_ids=[]) get_args_node = create_get_args_node(nonlocal_names) set_args_node = create_set_args_node(nonlocal_names) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 0485e5abbdf96..f04161f2c34cc 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -30,7 +30,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import RenameTransformer from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node -from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_node, create_get_args_node, create_set_args_node +from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_nodes, create_get_args_node, create_set_args_node from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import ARGS_NAME from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer @@ -93,101 +93,6 @@ def create_while_nodes(condition_name, body_name, loop_var_names, getter_name, return ret -class NameScope: - - def __init__(self): - """ we don't analyze the read only variable - because they keep the same in control flow. - """ - self.globals = set() - self.nonlocals = set() - self.args = set() - # all vars been stored, - # may be globals or non-locals - self.w_vars = set() - - def created_vars(self): - return self.w_vars - self.globals - self.nonlocals - self.args - - def write_vars(self): - return self.w_vars - - def global_vars(self): - return self.globals - - -class FunctionNameLivenessAnalysis(gast.NodeVisitor): - """ analyze the liveness of a function. - - every variables stored in this scope will be collected, - in addition with global/nonlocal information. - - 1. global variable is stored in node.var_globals. - 2. nonlocal variable is stored in node.var_nonlocals. - 3. arguments is stored in node.var_args. 
- - For example: - - def func(*args, **kargs): - a = 12 - global i,j - nonlocal x,y - print(a) - i = k - for m in range(10): - q = 12 - - After this visitor we have: - # node is the FunctionDef node with name: "func" - node.pd_scope = NameScope( - globals = ['i', 'j'], - nonlocals = ['x', 'y'], - args = ['args', 'kargs'], - wr_vars = ['a', 'i', 'q', 'm'] - ) - """ - - def __init__(self, root_node): - self.funcdef_stack = [] - self.visit(root_node) - - def _current_funcdef_scope(self): - return self.funcdef_stack[-1].pd_scope - - def visit_Name(self, node): - self.generic_visit(node) - write_context = (gast.Store, gast.AugStore, gast.Del) - if isinstance(node.ctx, write_context): - self._current_funcdef_scope().w_vars.add(node.id) - - def visit_FunctionDef(self, node): - setattr(node, 'pd_scope', NameScope()) - self.funcdef_stack.append(node) - self._current_funcdef_scope().args |= set( - self._get_argument_names(node)) - self.generic_visit(node) - self.funcdef_stack.pop() - - def visit_Global(self, node): - self._current_funcdef_scope().globals |= set(node.names) - - def visit_Nonlocal(self, node): - self._current_funcdef_scope().nonlocals |= set(node.names) - - def _get_argument_names(self, node): - """ get all arguments name in the functiondef node. - this node is local to the function and shouldn't - be created. - """ - assert isinstance( - node, gast.FunctionDef), "Input node is not function define node" - names = [a for a in node.args.args] - names.append(node.args.vararg) - names.append(node.args.kwarg) - names = [i.id for i in names if i is not None] - return names - - class NameVisitor(gast.NodeVisitor): ''' Analysis name liveness for loop transformer @@ -665,7 +570,7 @@ def get_for_stmt_nodes(self, node): if ARGS_NAME in nonlocal_names: nonlocal_names.remove(ARGS_NAME) - nonlocal_stmt_node = [create_nonlocal_stmt_node(nonlocal_names)] + nonlocal_stmt_node = create_nonlocal_stmt_nodes(nonlocal_names) # 4. append init statements new_stmts.extend(init_stmts) @@ -737,7 +642,7 @@ def get_while_stmt_nodes(self, node): if ARGS_NAME in nonlocal_names: nonlocal_names.remove(ARGS_NAME) - nonlocal_stmt_node = [create_nonlocal_stmt_node(nonlocal_names)] + nonlocal_stmt_node = create_nonlocal_stmt_nodes(nonlocal_names) # Python can create variable in loop and use it out of loop, E.g. # diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 072d22d47e029..2b95f346ae275 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -43,7 +43,9 @@ # solve it in dy2stat, we put float64 value with this magic number at Static # graph as a place holder to indicate the returning placeholder means no value # should return. -RETURN_NO_VALUE_MAGIC_NUM = 1.77113e+279 + +# Assign not support float64, use float32 value as magic number. 
+RETURN_NO_VALUE_MAGIC_NUM = 1.77113e+27 RETURN_NO_VALUE_VAR_NAME = "__no_value_return_var" @@ -216,44 +218,17 @@ def visit_FunctionDef(self, node): ctx=gast.Load(), annotation=None, type_comment=None))) - init_names = [ - unique_name.generate(RETURN_VALUE_INIT_NAME) - for i in range(max_return_length) - ] - assign_zero_nodes = [ - create_fill_constant_node(iname, 0.0) for iname in init_names - ] - if len(init_names) == 1: - return_value_nodes = gast.Name(id=init_names[0], - ctx=gast.Load(), - annotation=None, - type_comment=None) - else: - # We need to initialize return value as a tuple because control - # flow requires some inputs or outputs have same structure - return_value_nodes = gast.Tuple(elts=[ - gast.Name(id=iname, - ctx=gast.Load(), - annotation=None, - type_comment=None) for iname in init_names - ], - ctx=gast.Load()) assign_return_value_node = gast.Assign(targets=[ gast.Name(id=value_name, ctx=gast.Store(), annotation=None, type_comment=None) ], - value=return_value_nodes) + value=gast.Constant( + kind=None, value=None)) node.body.insert(0, assign_return_value_node) - node.body[:0] = assign_zero_nodes # Prepend no value placeholders - for name in self.return_no_value_name[node]: - assign_no_value_node = create_fill_constant_node( - name, RETURN_NO_VALUE_MAGIC_NUM) - node.body.insert(0, assign_no_value_node) - self.function_def.pop() return node @@ -340,74 +315,21 @@ def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name, cur_func_node = self.function_def[-1] return_length = get_return_size(return_node) - if return_length < max_return_length: - # In this case we should append RETURN_NO_VALUE placeholder - # - # max_return_length must be >= 1 here because return_length will be - # 0 at least. + # In this case we should NOT append RETURN_NO_VALUE placeholder + if return_node.value is not None: + cur_func_node = self.function_def[-1] if self.return_value_name[cur_func_node] is None: self.return_value_name[cur_func_node] = unique_name.generate( RETURN_VALUE_PREFIX) - no_value_names = [ - unique_name.generate(RETURN_NO_VALUE_VAR_NAME) - for j in range(max_return_length - return_length) - ] - self.return_no_value_name[cur_func_node].extend(no_value_names) - - # Handle tuple/non-tuple case - if max_return_length == 1: - assign_nodes.append( - gast.Assign(targets=[ - gast.Name(id=self.return_value_name[cur_func_node], - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - value=gast.Name(id=no_value_names[0], - ctx=gast.Load(), - annotation=None, - type_comment=None))) - else: - # max_return_length > 1 which means we should assign tuple - fill_tuple = [ - gast.Name(id=n, - ctx=gast.Load(), + assign_nodes.append( + gast.Assign(targets=[ + gast.Name(id=self.return_value_name[cur_func_node], + ctx=gast.Store(), annotation=None, - type_comment=None) for n in no_value_names - ] - if return_node.value is not None: - if isinstance(return_node.value, gast.Tuple): - fill_tuple[:0] = return_node.value.elts - else: - fill_tuple.insert(0, return_node.value) - - assign_nodes.append( - gast.Assign(targets=[ - gast.Name(id=self.return_value_name[cur_func_node], - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - value=gast.Tuple(elts=fill_tuple, - ctx=gast.Load()))) - else: - # In this case we should NOT append RETURN_NO_VALUE placeholder - if return_node.value is not None: - cur_func_node = self.function_def[-1] - if self.return_value_name[cur_func_node] is None: - self.return_value_name[ - cur_func_node] = unique_name.generate( - RETURN_VALUE_PREFIX) - 
- assign_nodes.append( - gast.Assign(targets=[ - gast.Name(id=self.return_value_name[cur_func_node], - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - value=return_node.value)) + type_comment=None) + ], + value=return_node.value)) stmt_list[i:] = assign_nodes return True diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 2191046ad1d3e..1c507ab23c311 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -30,8 +30,9 @@ import paddle from paddle.fluid import unique_name from paddle.fluid.data_feeder import convert_dtype -from paddle.fluid.layer_helper import LayerHelper from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.layers import assign # Note(Aurelius): Do not forget the dot `.` to distinguish other # module such as paddlenlp. @@ -64,6 +65,34 @@ def visit(self, node): return ret +# imp is deprecated in python3 +from importlib.machinery import SourceFileLoader + +dygraph_class_to_static_api = { + "CosineDecay": "cosine_decay", + "ExponentialDecay": "exponential_decay", + "InverseTimeDecay": "inverse_time_decay", + "NaturalExpDecay": "natural_exp_decay", + "NoamDecay": "noam_decay", + "PiecewiseDecay": "piecewise_decay", + "PolynomialDecay": "polynomial_decay", +} + +FOR_ITER_INDEX_PREFIX = '__for_loop_var_index' +FOR_ITER_TUPLE_PREFIX = '__for_loop_iter_tuple' +FOR_ITER_TUPLE_INDEX_PREFIX = '__for_loop_iter_tuple_index' +FOR_ITER_VAR_LEN_PREFIX = '__for_loop_var_len' +FOR_ITER_VAR_NAME_PREFIX = '__for_loop_iter_var' +FOR_ITER_ZIP_TO_LIST_PREFIX = '__for_loop_iter_zip' + +# FullArgSpec is valid from Python3. Defined a Namedtuple to +# to make it available in Python2. +FullArgSpec = collections.namedtuple('FullArgSpec', [ + 'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults', + 'annotations' +]) + + def data_layer_not_check(name, shape, dtype='float32', lod_level=0): """ This function creates a Tensor on the global block. The created Tensor @@ -99,42 +128,32 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): if shape[i] is None: shape[i] = -1 - return helper.create_variable(name=name, - shape=shape, - dtype=dtype, - type=core.VarDesc.VarType.LOD_TENSOR, - stop_gradient=True, - lod_level=lod_level, - is_data=True, - need_check_feed=False) - + return helper.create_global_variable(name=name, + shape=shape, + dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + stop_gradient=True, + lod_level=lod_level, + is_data=True, + need_check_feed=False) -# imp is deprecated in python3 -from importlib.machinery import SourceFileLoader -dygraph_class_to_static_api = { - "CosineDecay": "cosine_decay", - "ExponentialDecay": "exponential_decay", - "InverseTimeDecay": "inverse_time_decay", - "NaturalExpDecay": "natural_exp_decay", - "NoamDecay": "noam_decay", - "PiecewiseDecay": "piecewise_decay", - "PolynomialDecay": "polynomial_decay", -} +def create_undefined_var_like(variable): + """ create a undefined var with the same shape and dtype like varaible. 
+ """ + from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM + var = data_layer_not_check(unique_name.generate("undefined_var"), + variable.shape, variable.dtype) + assign(RETURN_NO_VALUE_MAGIC_NUM, var) + return var -FOR_ITER_INDEX_PREFIX = '__for_loop_var_index' -FOR_ITER_TUPLE_PREFIX = '__for_loop_iter_tuple' -FOR_ITER_TUPLE_INDEX_PREFIX = '__for_loop_iter_tuple_index' -FOR_ITER_VAR_LEN_PREFIX = '__for_loop_var_len' -FOR_ITER_VAR_NAME_PREFIX = '__for_loop_iter_var' -FOR_ITER_ZIP_TO_LIST_PREFIX = '__for_loop_iter_zip' -# FullArgSpec is valid from Python3. Defined a Namedtuple to -# to make it available in Python2. -FullArgSpec = collections.namedtuple('FullArgSpec', [ - 'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults', - 'annotations' -]) +def create_undefined_variable(): + from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM + var = data_layer_not_check(unique_name.generate("undefined_var"), [1], + "float64") + assign(RETURN_NO_VALUE_MAGIC_NUM, var) + return var class UndefinedVar: @@ -147,6 +166,12 @@ def check(self): "local variable '{}' should be created before using it.") +class Dygraph2StaticException(Exception): + + def __init__(self, message): + super().__init__(message) + + def saw(x): if isinstance(x, UndefinedVar): return x.check() @@ -1600,6 +1625,209 @@ def slice_is_num(slice_node): return False +class NameScope: + + def __init__(self): + """ + A NameScope is a object which manager all the variable names. + only FunctionDef and Controlflow node will have a namescope property. + + type can be "function" and "controlflow" + + we don't analyze the read only variable because they don't affect the analysis. + """ + self.globals = set() + self.nonlocals = set() + self.args = set() + self.father = None # point to the nearest function name scope. + self.w_vars = set() # all qualified + normal names been stored + self.created = set( + ) # useful for control flow compatibility. may be remove later + + def set_father(self, father): + self.father = father + + def existed_vars(self): + """ vars existing in current scope. + they must not contain qualified names. + """ + local_vars = self.w_vars - self.globals - self.nonlocals - self.args + return set(filter(lambda x: '.' not in x, local_vars)) + + def created_vars(self): + return self.created + + def modified_vars(self): + # may be globals / non-locals / args / qualified names and created_vars + return self.w_vars + + def control_flow_vars(self): + valid_names = self.w_vars + tmp = self.father.global_vars & valid_names, + return {"global": tmp, "nonlocal": self.w_vars - tmp} + + def global_vars(self): + return self.globals + + def merge_from(self, name_scope): + self.globals |= name_scope.globals + self.nonlocals |= name_scope.nonlocals + self.args |= name_scope.args + self.w_vars |= name_scope.w_vars + + +class FunctionNameLivenessAnalysis(gast.NodeVisitor): + """ analyze the liveness of a function. + + every variables stored in this scope will be collected, + in addition with global/nonlocal information. + + 1. global variable is stored in node.var_globals. + 2. nonlocal variable is stored in node.var_nonlocals. + 3. arguments is stored in node.var_args. 
+ + For example: + + def func(*args, **kargs): + a = 12 + global i,j + nonlocal x,y + print(a) + i = k + for m in range(10): + q = 12 + + After this visitor we have: + # node is the FunctionDef node with name: "func" + node.pd_scope = NameScope( + globals = ['i', 'j'], + nonlocals = ['x', 'y'], + args = ['args', 'kargs'], + wr_vars = ['a', 'i', 'q', 'm'] + ) + """ + + def __init__(self, root_node): + self.scope_node_stack = [] # controlflow, functiondef node + self.visit(root_node) + + def _reset_name_scope(self, node): + # always reset the node as empty namescope. + setattr(node, "pd_scope", NameScope()) + + def _get_name_scope(self, node): + if not hasattr(node, "pd_scope"): + setattr(node, "pd_scope", NameScope()) + return node.pd_scope + + def _current_name_scope(self): + return self._get_name_scope(self.scope_node_stack[-1]) + + def _father_name_scope(self): + if len(self.scope_node_stack) == 1: return None + return self._get_name_scope(self.scope_node_stack[-2]) + + def _nearest_function_scope(self): + if len(self.scope_node_stack) == 1: return None + for node in self.scope_node_stack[-2::-1]: + if isinstance(node, gast.FunctionDef): + return self._get_name_scope(node) + + def visit_Name(self, node): + self.generic_visit(node) + write_context = (gast.Store, gast.AugStore, gast.Del) + if isinstance(node.ctx, write_context): + self._current_name_scope().w_vars.add(node.id) + + def visit_FunctionDef(self, node): + + def pre_func(): + self._current_name_scope().args |= set( + self._get_argument_names(node)) + + def post_func(): + """ NOTE: why we need merge w_vars here ? + because we do ifelse_transformer after loop_transformer. Loops will changed into functioons. but we know this function will be called in if. so we add w_vars to father function scope. + """ + from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import WHILE_CONDITION_PREFIX, WHILE_BODY_PREFIX, FOR_CONDITION_PREFIX, FOR_BODY_PREFIX + from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import TRUE_FUNC_PREFIX, FALSE_FUNC_PREFIX + control_flow_function_def = [ + WHILE_BODY_PREFIX, WHILE_BODY_PREFIX, FOR_CONDITION_PREFIX, + FOR_BODY_PREFIX, TRUE_FUNC_PREFIX, FALSE_FUNC_PREFIX + ] + + def is_control_flow_def_node(): + for prefix in control_flow_function_def: + if node.name.startswith(prefix): return True + return False + + if self._father_name_scope() and is_control_flow_def_node(): + self._father_name_scope().w_vars |= self._current_name_scope( + ).w_vars + + self._visit_scope_node(node, pre_func, post_func) + + def _visit_scope_node(self, node, pre_func, post_func): + """ scope node main visit logic. 
+ pre_func and post_func is callbacks + """ + self._reset_name_scope(node) + self.scope_node_stack.append(node) + self._current_name_scope().father = self._nearest_function_scope() + if pre_func: pre_func() + self.generic_visit(node) + if post_func: post_func() + self.scope_node_stack.pop() + + def _visit_controlflow_node(self, node): + + def post_func(): + self._father_name_scope().merge_from(self._current_name_scope()) + self._current_name_scope().created = self._nearest_function_scope( + ).existed_vars() - node.before_created + + def pre_func(): + setattr(node, "before_created", + self._nearest_function_scope().existed_vars()) + + self._visit_scope_node(node, pre_func, post_func) + + def visit_For(self, node): + self._visit_controlflow_node(node) + + def visit_While(self, node): + self._visit_controlflow_node(node) + + def visit_If(self, node): + self._visit_controlflow_node(node) + + def visit_Global(self, node): + self._current_name_scope().globals |= set(node.names) + + def visit_Nonlocal(self, node): + self._current_name_scope().nonlocals |= set(node.names) + + def visit_Attribute(self, node): + self.generic_visit(node) + write_context = (gast.Store, gast.AugStore, gast.Del) + if isinstance(node.ctx, write_context): + name = ast_to_source_code(node).strip() + self._current_name_scope().w_vars.add(name) + + def _get_argument_names(self, node): + """ get all arguments name in the functiondef node. + this node is local to the function and shouldn't + be created. + """ + assert isinstance( + node, gast.FunctionDef), "Input node is not function define node" + names = [a for a in node.args.args] + names.append(node.args.vararg) + names.append(node.args.kwarg) + names = [i.id for i in names if i is not None] + return names + + def create_get_args_node(names): """ Create get_args function as follows: @@ -1617,21 +1845,24 @@ def {func_name}(): return gast.parse(textwrap.dedent(func_def)).body[0] assert isinstance(names, (list, tuple)) - if not names: - return empty_node() - mapped = list(filter(lambda n: '.' not in n, names)) nonlocal_names = sorted( mapped, key=mapped.index) # to keep the order, we can't use set() to unique + if not names: + return empty_node() + if not nonlocal_names: + nonlocal_vars = "\n" + else: + nonlocal_vars = "nonlocal " + ",".join(nonlocal_names) template = """ def {func_name}(): - nonlocal {nonlocal_vars} + {nonlocal_vars} return {vars}, """ func_def = template.format( func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX), - nonlocal_vars=','.join(nonlocal_names), + nonlocal_vars=nonlocal_vars, vars=",".join(names)) return gast.parse(textwrap.dedent(func_def)).body[0] @@ -1654,32 +1885,37 @@ def {func_name}({args}): return gast.parse(textwrap.dedent(func_def)).body[0] assert isinstance(names, (list, tuple)) - if not names: - return empty_node() - mapped = list(filter(lambda n: '.' 
not in n, names)) nonlocal_names = sorted( mapped, key=mapped.index) # to keep the order, we can't use set() to unique + if not names: + return empty_node() + if not nonlocal_names: + nonlocal_vars = "\n" + else: + nonlocal_vars = "nonlocal " + ",".join(nonlocal_names) template = """ def {func_name}({args}): - nonlocal {nonlocal_vars} + {nonlocal_vars} {vars}, = {args} """ func_def = template.format( func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), args=ARGS_NAME, - nonlocal_vars=','.join(nonlocal_names), + nonlocal_vars=nonlocal_vars, vars=",".join(names)) return gast.parse(textwrap.dedent(func_def)).body[0] -def create_nonlocal_stmt_node(names): +def create_nonlocal_stmt_nodes(names): assert isinstance(names, (list, tuple)) mapped = list(filter(lambda n: '.' not in n, names)) names = sorted( mapped, key=mapped.index) # to keep the order, we can't use set() to unique + if not names: + return [] func_code = "nonlocal {}".format(','.join(names)) - return gast.parse(func_code).body[0] + return [gast.parse(func_code).body[0]] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index 28d7cff8cb0ca..5593658ee6232 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -20,7 +20,7 @@ from paddle.utils import gast from paddle.fluid import unique_name from paddle.fluid.framework import Variable -from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, data_layer_not_check +from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, create_undefined_variable __all__ = [ 'create_bool_as_type', @@ -62,9 +62,10 @@ def to_static_variable(x): return paddle.full(shape=[1], dtype='float64', fill_value=x) if isinstance(x, six.integer_types): return paddle.full(shape=[1], dtype='int64', fill_value=x) - if isinstance(x, UndefinedVar): - return data_layer_not_check(unique_name.generator("loop_undefined_var"), - [-1]) + if isinstance(x, UndefinedVar) or x is None: + """ for early return case, we need a variable to represent None, current we use data_layer_not_check. 
+ """ + return create_undefined_variable() return x diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 4c3a4e5e8fcb1..87010bc616a64 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -21,7 +21,7 @@ from ..framework import Program, Variable, Operator, _non_static_mode, static_only, _in_legacy_dygraph, in_dygraph_mode from ..layer_helper import LayerHelper, unique_name from .nn import logical_and, logical_not, logical_or -from .utils import assert_same_structure, map_structure, hold_mutable_vars, copy_mutable_vars +from .utils import assert_same_structure, map_structure, hold_mutable_vars, copy_mutable_vars, padding_to_same_structure, is_sequence, pack_sequence_as, flatten, to_sequence import numpy import warnings import six @@ -107,9 +107,16 @@ def select_input(inputs, mask): def select_input_with_buildin_type(inputs, mask): from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable + from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, create_undefined_var_like support_ret_buildin_type = (bool, float, six.integer_types) false_var, true_var = inputs + if isinstance(false_var, UndefinedVar) and isinstance( + true_var, UndefinedVar): + """ None -> UndefinedVar, so the real value is a [None, UndefinedVar] or [None, None], we just return None. + """ + return None + if isinstance(false_var, Variable) and isinstance(true_var, Variable): return select_input(inputs, mask) @@ -132,6 +139,27 @@ def select_input_with_buildin_type(inputs, mask): "Return results from different branches in cond are not same type: " "false_var returned by fasle_fn is '{}' and true_var of true_fn is " "'{}'".format(type(false_var), type(true_var))) + elif ((isinstance(false_var, UndefinedVar) + and isinstance(true_var, (Variable, ) + support_ret_buildin_type)) + or (isinstance(true_var, UndefinedVar) + and isinstance(false_var, + (Variable, ) + support_ret_buildin_type))): + + def create_var_if_not_undefined_var(a): + if isinstance(a, UndefinedVar): return a + return to_static_variable(a) + + def create_like_if_undefined_var(a, b): + if isinstance(a, UndefinedVar): return create_undefined_var_like(b) + return a + + # TODO(xiongkun): add warning here. + true_var, false_var = create_var_if_not_undefined_var( + true_var), create_var_if_not_undefined_var(false_var) + inputs = [ + create_like_if_undefined_var(false_var, true_var), + create_like_if_undefined_var(true_var, false_var) + ] else: raise TypeError( "Unsupported return type of true_fn and false_fn in cond: false_var " @@ -1158,8 +1186,11 @@ def assign_skip_lod_tensor_array(input, output): """ Assign input to output, but skip the process of copying LoDTensorArray unless it's created in while_block. """ - if not isinstance(input, Variable) and not isinstance(input, core.VarBase): - output = input + if not isinstance(input, (Variable, core.VarBase)): + if isinstance(output, Variable): + assign(input, output) + else: + output = input return if input.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: @@ -2377,7 +2408,7 @@ def copy_var_to_parent_block(var, layer_helper): return parent_block_var -def cond(pred, true_fn=None, false_fn=None, name=None): +def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): """ This API returns ``true_fn()`` if the predicate ``pred`` is true else ``false_fn()`` . 
Users could also set ``true_fn`` or ``false_fn`` to @@ -2423,6 +2454,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None): true. The default value is ``None`` . false_fn(callable, optional): A callable to be performed if ``pred`` is false. The default value is ``None`` . + return_names: A list of strings to represents the name of returned vars. useful to debug. name(str, optional): The default value is ``None`` . Normally users don't have to set this parameter. For more information, please refer to :ref:`api_guide_Name` . @@ -2536,12 +2568,30 @@ def false_func(): "true_fn returns non-None while false_fn returns None") # Merge ture and false output if they are not None - try: - assert_same_structure(true_output, false_output, check_types=False) - except ValueError as e: + if return_names is None: + return_names = ["no name"] * len(to_sequence(true_output)) + else: + """ + dy2static will set the return_names and expand the return values to UndefinedVar. + """ + true_output, false_output = expand_undefined_var( + true_output, false_output, return_names) + true_output, false_output = change_none_to_undefinedvar( + true_output, false_output) + if len(to_sequence(true_output)) != len(to_sequence(false_output)): raise ValueError( - "Incompatible return values of true_fn and false_fn in cond: {}". - format(e)) + "true fn returns {} vars, but false fn returns {} vars, which is not equals" + .format(len(to_sequence(true_output)), + len(to_sequence(false_output)))) + for true_out, false_out, return_name in zip(to_sequence(true_output), + to_sequence(false_output), + to_sequence(return_names)): + try: + assert_same_structure(true_out, false_out, check_types=False) + except ValueError as e: + raise ValueError( + "Incompatible return values of `{}` in true_fn and false_fn in cond: {}" + .format(return_name, e)) mask = cast(pred, dtype='int32') merge_func = lambda false_var, true_var: select_input_with_buildin_type( @@ -2550,6 +2600,41 @@ def false_func(): return merged_output +def change_none_to_undefinedvar(nest1, nest2): + from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar + + def map_fn(x): + if x is None: return UndefinedVar("padding") + return x + + nest1_out = pack_sequence_as(nest1, list(map(map_fn, flatten(nest1)))) + nest2_out = pack_sequence_as(nest2, list(map(map_fn, flatten(nest2)))) + return nest1_out, nest2_out + + +def expand_undefined_var(nest1, nest2, names): + from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar + from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_VALUE_PREFIX + + def pack_undefined_var_as(seq): + return pack_sequence_as(seq, + [UndefinedVar("padding") for i in flatten(seq)]) + + def map_fn(n1, n2, name): + if not name.startswith(RETURN_VALUE_PREFIX) and (isinstance( + n1, UndefinedVar) or n1 is None): + return pack_undefined_var_as(n2) + return n1 + + nest1_out = list( + map(map_fn, to_sequence(nest1), to_sequence(nest2), to_sequence(names))) + nest2_out = list( + map(map_fn, to_sequence(nest2), to_sequence(nest1), to_sequence(names))) + if not is_sequence(nest1): nest1_out = nest1_out[0] + if not is_sequence(nest2): nest2_out = nest2_out[0] + return nest1_out, nest2_out + + def _error_message(what, arg_name, op_name, right_value, error_value): error_message = "{what} of '{arg_name}' in {op_name} must be " \ "{right_value}, but received: {error_value}.".format( diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index ca11727221f23..be8045b7bb8a5 100644 --- 
a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -125,6 +125,13 @@ def _yield_flat_nest(nest): yield n +def to_sequence(nest): + if is_sequence(nest): + return nest + else: + return [nest] + + def flatten(nest): """ :alias_main: paddle.flatten @@ -260,6 +267,26 @@ def _recursive_assert_same_structure(nest1, nest2, check_types): _recursive_assert_same_structure(n1, n2, check_types) +def padding_to_same_structure(nest1, nest2, obj=None): + + def _padding_to_same_structure_single(value, obj): + + def change_none_to_obj(x): + if x is None: return obj + return x + + if is_sequence(value): + value = pack_sequence_as( + value, [change_none_to_obj(item) for item in flatten(value)]) + else: + value = change_none_to_obj(value) + return value + + nest1 = _padding_to_same_structure_single(nest1, obj) + nest2 = _padding_to_same_structure_single(nest2, obj) + return nest1, nest2 + + def assert_same_structure(nest1, nest2, check_types=True): """ Confirm two nested structures with the same structure. diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index d0d024fb78624..b37accce9d1b8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -117,7 +117,7 @@ def dyfunc_with_if_else_early_return1(): b = paddle.zeros([3, 3]) return a, b a = paddle.zeros([2, 2]) + 1 - return a + return a, None def dyfunc_with_if_else_early_return2(): @@ -131,7 +131,7 @@ def dyfunc_with_if_else_early_return2(): d = paddle.zeros([3, 3]) + 1 return c, d e = paddle.zeros([2, 2]) + 3 - return e + return e, None def dyfunc_with_if_else_with_list_geneator(x): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index 8c7f301e9ed55..b544ca9bd8344 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -18,6 +18,7 @@ from __future__ import print_function import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid import ParamAttr from paddle.fluid import layers @@ -360,7 +361,7 @@ def beam_search(self, inputs): predicted_ids = [] parent_ids = [] - for step_idx in range(self.beam_max_step_num): + for step_idx in range(paddle.to_tensor(self.beam_max_step_num)): if fluid.layers.reduce_sum(1 - beam_finished).numpy()[0] == 0: break step_input = self._merge_batch_beams(step_input) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py index 6b4b2d46a12f6..9edff1859e41a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py @@ -19,11 +19,29 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.jit import declarative +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.utils import Dygraph2StaticException SEED = 2020 np.random.seed(SEED) +class TestDy2staticException(unittest.TestCase): + + def setUp(self): + self.x = np.random.random([10, 16]).astype('float32') + self.dyfunc = None + self.error 
= "Your if/else have different number of return value." + + def test_error(self): + if self.dyfunc: + with self.assertRaisesRegex(Dygraph2StaticException, self.error): + ProgramTranslator().enable(True) + self.assertTrue(declarative(self.dyfunc)(self.x)) + paddle.fluid.dygraph.base._in_declarative_mode_ = False + ProgramTranslator().enable(False) + + def test_continue_in_for(x): x = fluid.dygraph.to_variable(x) for i in range(10): @@ -265,10 +283,12 @@ def init_dygraph_func(self): self.dygraph_func = while_loop_class_var -class TestOptimBreakInFor(TestContinueInWhile): +class TestOptimBreakInFor(TestDy2staticException): - def init_dygraph_func(self): - self.dygraph_func = test_optim_break_in_for + def setUp(self): + self.x = np.random.random([10, 16]).astype('float32') + self.dyfunc = test_optim_break_in_for + self.error = "python while pred change from bool to variable." class TestOptimBreakInWhile(TestContinueInWhile): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py index 7986fb1cbae48..f588008b4625f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py @@ -17,20 +17,25 @@ import unittest import paddle -from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import FunctionNameLivenessAnalysis +from paddle.fluid.dygraph.dygraph_to_static.utils import FunctionNameLivenessAnalysis from paddle.utils import gast import inspect class JudgeVisitor(gast.NodeVisitor): - def __init__(self, ans): + def __init__(self, ans, mod): self.ans = ans + self.mod = mod def visit_FunctionDef(self, node): scope = node.pd_scope expected = self.ans.get(node.name, set()) - assert scope.created_vars() == expected, "Not Equals." + exp_mod = self.mod.get(node.name, set()) + assert scope.existed_vars() == expected, "Not Equals." + assert scope.modified_vars( + ) == exp_mod, "Not Equals in function:{} . 
expect {} , but get {}".format( + node.name, exp_mod, scope.modified_vars()) self.generic_visit(node) @@ -108,12 +113,31 @@ def init_dygraph_func(self): }, ] + self.modified_var = [ + { + 'func': set('ki'), + 'test_nonlocal': set('i') + }, + { + 'func': set({'i'}), + 'test_global': set({"t"}) + }, + { + 'func': set('i'), + }, + { + 'func': set('i'), + 'test_normal_argument': set('x') + }, + ] + def test_main(self): - for ans, func in zip(self.answer, self.all_dygraph_funcs): + for mod, ans, func in zip(self.modified_var, self.answer, + self.all_dygraph_funcs): test_func = inspect.getsource(func) gast_root = gast.parse(test_func) name_visitor = FunctionNameLivenessAnalysis(gast_root) - JudgeVisitor(ans).visit(gast_root) + JudgeVisitor(ans, mod).visit(gast_root) def TestClosureAnalysis_Attribute_func(): @@ -128,6 +152,10 @@ def init_dygraph_func(self): self.all_dygraph_funcs = [TestClosureAnalysis_Attribute_func] self.answer = [{"TestClosureAnalysis_Attribute_func": set({'i'})}] + self.modified_var = [{ + "TestClosureAnalysis_Attribute_func": + set({'i', 'self.current.function'}) + }] if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index 1f1624280a023..acfd29102691a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -20,6 +20,7 @@ import paddle from paddle.fluid.dygraph.jit import declarative from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.utils import Dygraph2StaticException import paddle.fluid.core as core from ifelse_simple_func import * @@ -32,6 +33,22 @@ place = fluid.CPUPlace() +class TestDy2staticException(unittest.TestCase): + + def setUp(self): + self.x = np.random.random([10, 16]).astype('float32') + self.dyfunc = None + self.error = "Your if/else have different number of return value." + + def test_error(self): + if self.dyfunc: + with self.assertRaisesRegex(Dygraph2StaticException, self.error): + ProgramTranslator().enable(True) + self.assertTrue(declarative(self.dyfunc)(self.x)) + paddle.fluid.dygraph.base._in_declarative_mode_ = False + ProgramTranslator().enable(False) + + class TestDygraphIfElse(unittest.TestCase): """ TestCase for the transformation from control flow `if/else` @@ -417,16 +434,12 @@ def test_ast_to_func(self): self.assertIsInstance(self.out[1], int) -class TestDy2StIfElseRetInt2(TestDy2StIfElseRetInt1): +class TestDy2StIfElseRetInt2(TestDy2staticException): def setUp(self): self.x = np.random.random([5]).astype('float32') + self.error = "Your if/else have different number of return value." self.dyfunc = dyfunc_ifelse_ret_int2 - self.out = self.get_dy2stat_out() - - def test_ast_to_func(self): - self.assertIsInstance(self.out[0], (paddle.Tensor, core.eager.Tensor)) - self.assertIsInstance(self.out[1], (paddle.Tensor, core.eager.Tensor)) class TestDy2StIfElseRetInt3(TestDy2StIfElseRetInt1): @@ -448,7 +461,7 @@ def setUp(self): def test_ast_to_func(self): ProgramTranslator().enable(True) - with self.assertRaises(TypeError): + with self.assertRaises(Dygraph2StaticException): static_func = paddle.jit.to_static(self.dyfunc) out = static_func(self.x) # Why need set `_in_declarative_mode_` here? 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py index 826063cf67392..97043fd7ba688 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,264 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from __future__ import print_function - -import unittest -import textwrap -from paddle.utils import gast -from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import get_name_ids -from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor -from paddle.fluid.dygraph.dygraph_to_static.static_analysis import NodeVarType -from paddle.fluid.dygraph.dygraph_to_static.utils import is_control_flow_to_transform - - -class TestGetNameIds(unittest.TestCase): - """ - Test for parsing the ast.Name list from the ast.Nodes - """ - - def setUp(self): - self.source = """ - def test_fn(x): - return x+1 - """ - self.all_name_ids = {'x': [gast.Param(), gast.Load()]} - - def test_get_name_ids(self): - source = textwrap.dedent(self.source) - root = gast.parse(source) - all_name_ids = get_name_ids([root]) - self.assertDictEqual(self.transfer_dict(self.all_name_ids), - self.transfer_dict(all_name_ids)) - - def transfer_dict(self, name_ids_dict): - new_dict = {} - for name, ctxs in name_ids_dict.items(): - new_dict[name] = [type(ctx) for ctx in ctxs] - return new_dict - - -class TestGetNameIds2(TestGetNameIds): - - def setUp(self): - self.source = """ - def test_fn(x, y): - a = 1 - x = y + a - if x > y: - z = x * x - z = z + a - else: - z = y * y - return z - """ - self.all_name_ids = { - 'x': - [gast.Param(), - gast.Store(), - gast.Load(), - gast.Load(), - gast.Load()], - 'a': [gast.Store(), gast.Load(), - gast.Load()], - 'y': [ - gast.Param(), - gast.Load(), - gast.Load(), - gast.Load(), - gast.Load(), - ], - 'z': [ - gast.Store(), - gast.Load(), - gast.Store(), - gast.Store(), - gast.Load(), - ] - } - - -class TestGetNameIds3(TestGetNameIds): - - def setUp(self): - self.source = """ - def test_fn(x, y): - z = 1 - if x > y: - z = x * x - z = z + y - return z - """ - self.all_name_ids = { - 'x': [ - gast.Param(), - gast.Load(), - gast.Load(), - gast.Load(), - ], - 'y': [ - gast.Param(), - gast.Load(), - gast.Load(), - ], - 'z': [ - gast.Store(), - gast.Store(), - gast.Load(), - gast.Store(), - gast.Load(), - ] - } - - -class TestIsControlFlowIf(unittest.TestCase): - - def check_false_case(self, code): - code = textwrap.dedent(code) - node = gast.parse(code) - node_test = node.body[0].value - - self.assertFalse(is_control_flow_to_transform(node_test)) - - def test_expr(self): - # node is not ast.Compare - self.check_false_case("a+b") - - def test_expr2(self): - # x is a Tensor. 
- node = gast.parse("a + x.numpy()") - node_test = node.body[0].value - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_is_None(self): - self.check_false_case("x is None") - - def test_is_None2(self): - self.check_false_case("fluid.layers.sum(x) is None") - - def test_is_None3(self): - self.check_false_case("fluid.layers.sum(x).numpy() != None") - - def test_is_None4(self): - node = gast.parse("fluid.layers.sum(x) and 2>1") - node_test = node.body[0].value - - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_if(self): - node = gast.parse("x.numpy()[1] > 1") - node_test = node.body[0].value - - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_if_with_and(self): - node = gast.parse("x and 1 < x.numpy()[1]") - node_test = node.body[0].value - - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_if_with_or(self): - node = gast.parse("1 < fluid.layers.sum(x).numpy()[2] or x+y < 0") - node_test = node.body[0].value - - self.assertTrue(is_control_flow_to_transform(node_test)) - - def test_shape(self): - code = """ - def foo(x): - batch_size = fluid.layers.shape(x) - if batch_size[0] > 16: - x = x + 1 - return x - """ - code = textwrap.dedent(code) - node = gast.parse(code) - static_analysis_visitor = StaticAnalysisVisitor(node) - test_node = node.body[0].body[1].test - - self.assertTrue( - is_control_flow_to_transform(test_node, static_analysis_visitor)) - - def test_shape_with_andOr(self): - code = """ - def foo(x): - batch_size = fluid.layers.shape(x) - if x is not None and batch_size[0] > 16 or 2 > 1: - x = x + 1 - return x - """ - code = textwrap.dedent(code) - node = gast.parse(code) - static_analysis_visitor = StaticAnalysisVisitor(node) - test_node = node.body[0].body[1].test - - self.assertTrue( - is_control_flow_to_transform(test_node, static_analysis_visitor)) - - def test_paddle_api(self): - code = """ - def foo(x): - if fluid.layers.shape(x)[0] > 16: - x = x + 1 - return x - """ - code = textwrap.dedent(code) - node = gast.parse(code) - static_analysis_visitor = StaticAnalysisVisitor(node) - test_node = node.body[0].body[0].test - - self.assertTrue( - is_control_flow_to_transform(test_node, static_analysis_visitor)) - - def test_paddle_api_with_andOr(self): - code_or = """ - def foo(x): - if 2 > 1 and fluid.layers.shape(x)[0] > 16 or x is not None : - x = x + 1 - return x - """ - - code_and = """ - def foo(x): - if 2 > 1 and fluid.layers.shape(x)[0] > 16 and x is not None : - x = x + 1 - return x - """ - for code in [code_or, code_and]: - code = textwrap.dedent(code) - node = gast.parse(code) - static_analysis_visitor = StaticAnalysisVisitor(node) - test_node = node.body[0].body[0].test - - self.assertTrue( - is_control_flow_to_transform(test_node, - static_analysis_visitor)) - - def test_with_node_var_type_map(self): - node = gast.parse("x > 1") - node_test = node.body[0].value - - # if x is a Tensor - var_name_to_type = {"x": {NodeVarType.TENSOR}} - - self.assertTrue( - is_control_flow_to_transform(node_test, - var_name_to_type=var_name_to_type)) - - # if x is not a Tensor - var_name_to_type = {"x": {NodeVarType.NUMPY_NDARRAY}} - self.assertFalse( - is_control_flow_to_transform(node_test, - var_name_to_type=var_name_to_type)) - - def test_raise_error(self): - node = "a + b" - with self.assertRaises(Exception) as e: - self.assertRaises(TypeError, is_control_flow_to_transform(node)) - self.assertTrue( - "The type of input node must be gast.AST" in str(e.exception)) - - -if __name__ == '__main__': - 
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index e22cee2ffeea6..c7cecab04f564 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -66,11 +66,7 @@ def get_source_code(func): class StaticCode1(): def dyfunc_with_if_else(x_v, label=None): - __return_value_init_0 = paddle.full(shape=[1], - dtype='float64', - fill_value=0.0, - name='__return_value_init_0') - __return_value_0 = __return_value_init_0 + __return_value_0 = None def get_args_0(): nonlocal x_v @@ -83,51 +79,51 @@ def set_args_0(__args): def true_fn_0(): nonlocal x_v x_v = x_v - 1 - return x_v + return def false_fn_0(): nonlocal x_v x_v = x_v + 1 - return x_v + return _jst.IfElse( paddle.mean(x_v)[0] > 5, true_fn_0, false_fn_0, get_args_0, set_args_0, ('x_v', )) + __return_0 = _jst.UndefinedVar('__return_0') + __return_1 = _jst.UndefinedVar('__return_1') + loss = _jst.UndefinedVar('loss') def get_args_1(): - nonlocal __return_value_0, label, x_v - return __return_value_0, label, x_v, + nonlocal __return_0, __return_1, __return_value_0, loss + return __return_0, __return_1, __return_value_0, loss def set_args_1(__args): - nonlocal __return_value_0, label, x_v - __return_value_0, label, x_v, = __args + nonlocal __return_0, __return_1, __return_value_0, loss + __return_0, __return_1, __return_value_0, loss = __args def true_fn_1(): - nonlocal __return_value_0, label, x_v + nonlocal __return_0, __return_1, __return_value_0, loss loss = fluid.layers.cross_entropy(x_v, label) __return_0 = _jst.create_bool_as_type(label is not None, True) __return_value_0 = loss - return __return_value_0 + return def false_fn_1(): - nonlocal __return_value_0, label, x_v + nonlocal __return_0, __return_1, __return_value_0, loss __return_1 = _jst.create_bool_as_type(label is not None, True) __return_value_0 = x_v - return __return_value_0 + return _jst.IfElse(label is not None, true_fn_1, false_fn_1, get_args_1, - set_args_1, ('__return_value_0', )) + set_args_1, + ('__return_0', '__return_1', '__return_value_0', 'loss')) return __return_value_0 class StaticCode2(): # TODO: Transform return statement def dyfunc_with_if_else(x_v, label=None): - __return_value_init_1 = paddle.full(shape=[1], - dtype='float64', - fill_value=0.0, - name='__return_value_init_1') - __return_value_1 = __return_value_init_1 + __return_value_1 = None def get_args_2(): nonlocal x_v @@ -140,40 +136,44 @@ def set_args_2(__args): def true_fn_2(): nonlocal x_v x_v = x_v - 1 - return x_v + return def false_fn_2(): nonlocal x_v x_v = x_v + 1 - return x_v + return _jst.IfElse( paddle.mean(x_v)[0] > 5, true_fn_2, false_fn_2, get_args_2, set_args_2, ('x_v', )) + __return_2 = _jst.UndefinedVar('__return_2') + __return_3 = _jst.UndefinedVar('__return_3') + loss = _jst.UndefinedVar('loss') def get_args_3(): - nonlocal __return_value_1, label, x_v - return __return_value_1, label, x_v, + nonlocal __return_2, __return_3, __return_value_1, loss + return __return_2, __return_3, __return_value_1, loss def set_args_3(__args): - nonlocal __return_value_1, label, x_v - __return_value_1, label, x_v, = __args + nonlocal __return_2, __return_3, __return_value_1, loss + __return_2, __return_3, __return_value_1, loss = __args def true_fn_3(): - nonlocal __return_value_1, label, x_v + nonlocal __return_2, __return_3, __return_value_1, 
loss loss = fluid.layers.cross_entropy(x_v, label) __return_2 = _jst.create_bool_as_type(label is not None, True) __return_value_1 = loss - return __return_value_1 + return def false_fn_3(): - nonlocal __return_value_1, label, x_v + nonlocal __return_2, __return_3, __return_value_1, loss __return_3 = _jst.create_bool_as_type(label is not None, True) __return_value_1 = x_v - return __return_value_1 + return _jst.IfElse(label is not None, true_fn_3, false_fn_3, get_args_3, - set_args_3, ('__return_value_1', )) + set_args_3, + ('__return_2', '__return_3', '__return_value_1', 'loss')) return __return_value_1 @@ -195,6 +195,7 @@ def setUp(self): def test_decorator(self): program_translator = ProgramTranslator() code = program_translator.get_code(dyfunc_with_if_else) + #print(code) answer = get_source_code(StaticCode1.dyfunc_with_if_else) self.assertEqual( answer.replace('\n', '').replace(' ', ''), @@ -380,13 +381,13 @@ def test_ifelse_early_return1(self): answer = np.zeros([2, 2]) + 1 static_func = paddle.jit.to_static(dyfunc_with_if_else_early_return1) out = static_func() - self.assertTrue(np.allclose(answer, out.numpy())) + self.assertTrue(np.allclose(answer, out[0].numpy())) def test_ifelse_early_return2(self): answer = np.zeros([2, 2]) + 3 static_func = paddle.jit.to_static(dyfunc_with_if_else_early_return2) out = static_func() - self.assertTrue(np.allclose(answer, out.numpy())) + self.assertTrue(np.allclose(answer, out[0].numpy())) class TestRemoveCommentInDy2St(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py index a5a6b14676982..7f78788e59652 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py @@ -19,6 +19,7 @@ import paddle.fluid.core as core from paddle.jit import to_static from paddle.jit import ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.utils import Dygraph2StaticException import unittest import numpy as np @@ -245,7 +246,7 @@ def _run(self, to_static=False): return res.numpy() return res - def test_transformed_static_result(self): + def _test_value_impl(self): dygraph_res = self._run(to_static=False) static_res = self._run(to_static=True) if isinstance(dygraph_res, tuple): @@ -264,6 +265,13 @@ def test_transformed_static_result(self): else: self.assertEqual(dygraph_res, static_res) + def test_transformed_static_result(self): + if hasattr(self, "error"): + with self.assertRaisesRegex(Dygraph2StaticException, self.error): + self._test_value_impl() + else: + self._test_value_impl() + class TestInsideFuncBase(TestReturnBase): @@ -312,12 +320,14 @@ class TestReturnDifferentLengthIfBody(TestReturnBase): def init_dygraph_func(self): self.dygraph_func = test_return_different_length_if_body + self.error = "Your if/else have different number of return value." class TestReturnDifferentLengthElse(TestReturnBase): def init_dygraph_func(self): self.dygraph_func = test_return_different_length_else + self.error = "Your if/else have different number of return value." class TestNoReturn(TestReturnBase): @@ -330,12 +340,14 @@ class TestReturnNone(TestReturnBase): def init_dygraph_func(self): self.dygraph_func = test_return_none + self.error = "Your if/else have different number of return value." 
class TestReturnNoVariable(TestReturnBase): def init_dygraph_func(self): self.dygraph_func = test_return_no_variable + self.error = "Your if/else have different number of return value." class TestReturnListOneValue(TestReturnBase): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index ab52d518fe7af..2239c6544f219 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -21,6 +21,7 @@ from paddle.fluid.dygraph import Embedding, Layer, LayerNorm, Linear, to_variable from paddle.fluid.dygraph.jit import dygraph_to_static_func from paddle.fluid.layers.utils import map_structure +from paddle.fluid.layers.tensor import range as pd_range def position_encoding_init(n_position, d_pos_vec): @@ -633,7 +634,7 @@ def gather(input, indices, batch_pos): value=0), } for i in range(self.n_layer)] - for i in range(max_len): + for i in pd_range(0, max_len, 1, dtype="int32"): trg_pos = layers.fill_constant(shape=trg_word.shape, dtype="int64", value=i) diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index 1a75b30d4849a..61043cab36a68 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -25,6 +25,7 @@ from paddle.fluid.backward import append_backward from paddle.fluid.framework import Program, program_guard from simple_nets import simple_fc_net_with_inputs, batchnorm_fc_with_inputs +import paddle np.random.seed(123) @@ -41,6 +42,8 @@ def test_return_single_var(self): return -1 """ + paddle.enable_static() + def true_func(): return layers.fill_constant(shape=[2, 3], dtype='int32', value=2) @@ -73,6 +76,8 @@ def test_return_var_tuple(self): return 3, 2 """ + paddle.enable_static() + def true_func(): return layers.fill_constant(shape=[1, 2], dtype='int32', value=1), layers.fill_constant( @@ -114,6 +119,8 @@ def test_pass_and_modify_var(self): a = a - (i - 1) """ + paddle.enable_static() + def true_func(a, i): a = a * (i + 1) return a @@ -152,6 +159,8 @@ def test_return_none(self): pass """ + paddle.enable_static() + def true_func(): pass @@ -181,6 +190,8 @@ def test_wrong_structure_exception(self): test returning different number of tensors cannot merge into output """ + paddle.enable_static() + def func_return_none(): return None @@ -223,10 +234,11 @@ def func_return_two_tensors(): out = layers.cond(pred, func_return_one_tensor, func_return_two_tensors) self.assertTrue( - "Incompatible return values of true_fn and false_fn in cond" in - str(e.exception)) + "true fn returns 1 vars, but false fn returns 2 vars, which is not equals" + in str(e.exception)) def test_extremely_simple_net_with_op_in_condition(self): + paddle.enable_static() main_program = fluid.Program() startup_program = fluid.Program() with fluid.program_guard(main_program, startup_program): @@ -272,6 +284,8 @@ def test_cond_inside_cond(self): return a / a """ + paddle.enable_static() + def less_than_branch(i, a): return layers.cond(i >= 3.0, lambda: layers.elementwise_add(a, a), lambda: layers.elementwise_sub(a, a)) @@ -308,6 +322,7 @@ def greater_equal_branch(i, a): self.assertEqual(ret[1][0], expected_a_grad) def test_cond_op_in_condition(self): + paddle.enable_static() main_program = fluid.Program() startup_program = fluid.Program() @@ -344,6 +359,7 @@ def 
backward_value_helper(self, cond_func, use_cuda, use_parallel_exe): """ Helper function that compares calculated backward value is close to dy/dx """ + paddle.enable_static() main_program = Program() main_program.random_seed = 123 startup_program = Program() @@ -474,6 +490,8 @@ def add_optimizer_helper(self, cond_func, use_cuda, use_parallel_exe): def test_cond_backward(self): + paddle.enable_static() + def cond_func(i, img, label): predicate = ((i % 2) == 0) return layers.cond( @@ -494,6 +512,7 @@ def cond_func(i, img, label): use_parallel_exe) def test_half_nested_cond_backward(self): + paddle.enable_static() def branch(i, img, label): return layers.cond( @@ -530,6 +549,7 @@ def cond_func_simple_net_at_false(i, img, label): use_parallel_exe) def test_nested_cond_backward(self): + paddle.enable_static() def branch(i, img, label, mod_two): if mod_two: @@ -560,6 +580,7 @@ def cond_func(i, img, label): class TestCondWithError(unittest.TestCase): def test_input_type_error(self): + paddle.enable_static() main_program = framework.Program() startup_program = framework.Program() with framework.program_guard(main_program, startup_program): From f39183ea671bcef955b2296731db882601111548 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Wed, 6 Jul 2022 20:39:45 +0800 Subject: [PATCH 084/250] =?UTF-8?q?=E3=80=90Paddle-Inference=E3=80=91=20fi?= =?UTF-8?q?x=20nvcc=5Flazy=20=20(#44114)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * nvcc_lazy * nvcc_lazy * conv_fusion --- cmake/experiments/cuda_module_loading_lazy.cmake | 10 +++++----- tools/nvcc_lazy | 8 +++++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake index 0f0793a8ee32b..bcbfaacad1240 100644 --- a/cmake/experiments/cuda_module_loading_lazy.cmake +++ b/cmake/experiments/cuda_module_loading_lazy.cmake @@ -13,8 +13,8 @@ # limitations under the License. # this file contains experimental build options for lazy cuda module loading -# cuda moduel lazy loading is supported by CUDA 11.6+ -# this experiment option makes Paddle supports lazy loading before CUDA 11.6. +# cuda moduel lazy loading is supported by CUDA 11.7+ +# this experiment option makes Paddle supports lazy loading before CUDA 11.7. option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF) if(${EXP_CUDA_MODULE_LOADING_LAZY}) @@ -28,13 +28,13 @@ if(${EXP_CUDA_MODULE_LOADING_LAZY}) message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA") return() endif() - if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.6") - message("cuda 11.6+ already support lazy module loading") + if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.7") + message("cuda 11.7+ already support lazy module loading") return() endif() message( - "for cuda before 11.6, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a" + "for cuda before 11.7, libcudart.so must be used for the lazy module loading trick to work, instead of libcudart_static.a" ) set(CUDA_USE_STATIC_CUDA_RUNTIME OFF diff --git a/tools/nvcc_lazy b/tools/nvcc_lazy index 9cb49b04ffaff..e3e7e361021c2 100755 --- a/tools/nvcc_lazy +++ b/tools/nvcc_lazy @@ -1,4 +1,6 @@ #!/usr/bin/env bash +unset GREP_OPTIONS +set -e # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# @@ -17,11 +19,11 @@ ## CUDA_MODULE_LOADING=EAGER,DEFAULT,LAZY -# check nvcc version, if nvcc >= 11.6, just run nvcc itself -CUDA_VERSION=$(nvcc --version | grep -oP '(?<=cuda_)\d*\.\d*') +# check nvcc version, if nvcc >= 11.7, just run nvcc itself +CUDA_VERSION=$(nvcc --version | grep -oP '(?<=V)\d*\.\d*') CUDA_VERSION_MAJOR=${CUDA_VERSION%.*} CUDA_VERSION_MINOR=${CUDA_VERSION#*.} -if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 6) )); then +if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 7) )); then nvcc "$@" exit fi From 48abaec6d9998075ab0141c35c5411ab48f292a9 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Wed, 6 Jul 2022 16:12:31 +0200 Subject: [PATCH 085/250] Performance fix for recommender model (#43803) * fix for binary kernels * fixed performance for elementwise, reduce and concat * added comment * CI fix * CI fix * added formatting * reverted one file * Revert "reverted one file" This reverts commit 54725e1c62318d3a18913821200e973816751019. * Revert "added formatting" This reverts commit b9795dd253d755a329376d7ab0542860aa7815c6. * added enforcing oneDNN BF16 reduce kernel * fix for eltwise and reenabled reshape kernels * fix for binary handler * added formatting * referted changes for flatten,squeeze and reshape ops --- .../mkldnn/elementwise_mkldnn_op.h | 17 +++++++-- .../operators/mkldnn/concat_mkldnn_op.cc | 19 +++++++++- paddle/fluid/operators/reduce_ops/reduce_op.h | 35 ++++++++++++++++++- paddle/fluid/platform/mkldnn_reuse.h | 9 +++-- .../mkldnn/test_elementwise_add_mkldnn_op.py | 8 +++++ 5 files changed, 82 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 61552e492dfa1..7f6566460ab62 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -75,8 +75,8 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); float scale_x = ctx.Attr("Scale_x"); @@ -96,6 +96,12 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { scale_o, get_post_ops(ctx)); + // oneDNN's binary is optimized for broadcasting y into x, so in other case + // we have to swap tensors to achieve optimal performance + if (x->numel() < y->numel()) { + std::swap(x, y); + } + const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); // (jczaja) For Inplace src and dst should be the same memory object. 
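A quick pure-Python restatement of the operand-swap rule introduced above, for illustration only (the helper name is made up): oneDNN's binary primitive is optimized for broadcasting y into x, so whenever x has fewer elements than y the operands are exchanged, and the grad kernel exchanges dx/dy along with them.

import numpy as np


def order_for_onednn_binary(x_shape, y_shape):
    # Mirror of the numel-based swap above: the smaller (broadcast) operand
    # is always moved into the y slot before building memory descriptors.
    if int(np.prod(x_shape)) < int(np.prod(y_shape)):
        return y_shape, x_shape, True       # swapped
    return x_shape, y_shape, False          # original order kept


# shapes borrowed from the TestMKLDNNElementwiseAddOpBroadcastXintoY case
# added later in this patch
print(order_for_onednn_binary([2, 50, 1], [2, 50, 160]))    # -> swapped
print(order_for_onednn_binary([2, 50, 160], [2, 50, 1]))    # -> kept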
@@ -159,6 +165,13 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { auto* dy = ctx.Output(framework::GradVarName("Y")); auto* dout = ctx.Input(framework::GradVarName("Out")); + // oneDNN's binary is optimized for broadcasting y into x, so in other case + // we have to swap tensors to achieve optimal performance + if (x->numel() < y->numel()) { + std::swap(x, y); + std::swap(dx, dy); + } + int axis = ctx.Attr("axis"); auto tz = phi::vectorize(dout->dims()); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index cefa4fc1b995b..837d4357737a2 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -77,7 +77,24 @@ class ConcatMKLDNNHandler } auto dst_dims = phi::vectorize(output->dims()); - auto dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any); + + dnnl::memory::desc dst_md; + + // if concat is being used as a stack op(all source memories dims on + // concat_axis are equal to 1), then it may choose a non-optimal memory + // format tag for destination, because concat primitive is chosing it based + // on source memory descriptors and f.e.200x1x10 can be described as both + // abc and bac and both would be using exact same physical layout, but in + // that scenario bac will be chosen for destination no matter which + // formats are being set in inputs. In that scenario we are enforcing using + // a dense format, because it is the most common one and should be the best + // in terms of the performance + if (dst_dims[concat_axis] == static_cast(srcs_md.size())) { + dst_md = memory::desc( + dst_dims, dt, platform::GetPlainMKLDNNFormat(dst_dims.size())); + } else { + dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any); + } this->AcquireForwardPrimitiveDescriptor(dst_md, concat_axis, srcs_md); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ec3cf1908c5b5..e9bc3905a22ee 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -545,6 +545,38 @@ class ReduceOp : public framework::OperatorWithKernel { } } + // oneDNN's reduction kernel is optimized only for reducing throughout the + // most outer dims, so in case of another type of reduction, it would be + // better to fallback to native implementation + static bool HasOptimizedOneDNNKernel(const framework::ExecutionContext& ctx) { + // native reduce kernels don't support bf16 + // so oneDNN kernel is enforced in that case + if (ctx.Input("X")->dtype() == + experimental::DataType::BFLOAT16) + return true; + + auto reduce_dims = ctx.Attr>("dim"); + const bool reduce_all = ctx.Attr("reduce_all"); + int ndims = ctx.Input("X")->dims().size(); + + if (reduce_all) { + return true; + } + + for (size_t i = 0; i < reduce_dims.size(); ++i) { + if (reduce_dims[i] < 0) reduce_dims[i] = ndims + reduce_dims[i]; + } + sort(reduce_dims.begin(), reduce_dims.end()); + for (size_t i = 0; i < reduce_dims.size(); ++i) { + if (reduce_dims[reduce_dims.size() - i - 1] != + static_cast(ndims - i - 1)) { + return false; + } + } + + return true; + } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. 
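To make the dims check in HasOptimizedOneDNNKernel above concrete, here is a pure-Python sketch of the same predicate (the helper name is made up; the bfloat16 and reduce_all early-outs are omitted): the oneDNN reduce kernel is kept only when the reduced axes, after normalizing negative values, form the trailing block of dimensions ending at ndims - 1.

def reduces_trailing_dims(reduce_dims, ndims):
    # Normalize negative axes (e.g. -1 -> ndims - 1) and sort, as the C++
    # check does, then require the axes to be exactly ndims-k, ..., ndims-1.
    dims = sorted(d + ndims if d < 0 else d for d in reduce_dims)
    return dims == list(range(ndims - len(dims), ndims))


assert reduces_trailing_dims([2, 3], ndims=4)        # e.g. NCHW reduced over H, W
assert reduces_trailing_dims([-1, -2], ndims=4)      # same axes written negatively
assert not reduces_trailing_dims([0], ndims=4)       # leading axis -> native fallback
assert not reduces_trailing_dims([1, 2], ndims=4)    # not a trailing block -> fallback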
@@ -554,7 +586,8 @@ class ReduceOp : public framework::OperatorWithKernel { return framework::OpKernelType(input_data_type, ctx.GetPlace()); #ifdef PADDLE_WITH_MKLDNN - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + if (this->CanMKLDNNBeUsed(ctx, input_data_type) && + HasOptimizedOneDNNKernel(ctx)) { return framework::OpKernelType(input_data_type, ctx.GetPlace(), framework::DataLayout::kMKLDNN, diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 41a4f551cedc1..2f4bbfaf74fcc 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -690,8 +690,13 @@ class BinaryMKLDNNHandler auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); - this->AcquireForwardPrimitiveDescriptor( - attributes, algo, src0_md, src1_md, dst_md); + if (x->numel() < y->numel()) { + this->AcquireForwardPrimitiveDescriptor( + attributes, algo, src1_md, src0_md, dst_md); + } else { + this->AcquireForwardPrimitiveDescriptor( + attributes, algo, src0_md, src1_md, dst_md); + } } std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py index 2ae717d64a302..dc9a3862e0421 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py @@ -68,6 +68,14 @@ def init_input_output(self): self.out = np.add(self.x, self.y) +class TestMKLDNNElementwiseAddOpBroadcastXintoY(TestMKLDNNElementwiseAddOp): + + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 50, 1]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [2, 50, 160]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp): def init_input_output(self): From 1e6137b5042389f559eda13fde2f806fa8f5160a Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Thu, 7 Jul 2022 10:42:35 +0800 Subject: [PATCH 086/250] add resnet_basic_block for kunlun, test=kunlun (#43949) --- paddle/fluid/operators/fused/CMakeLists.txt | 7 +- .../operators/fused/resnet_basic_block_op.cc | 576 ++++++++++++++++++ paddle/fluid/pybind/op_function_generator.h | 25 + 3 files changed, 607 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/fused/resnet_basic_block_op.cc diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 4ffb96d3c51bc..dfbdaed87614f 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -26,12 +26,17 @@ register_operators( fused_bias_dropout_residual_layer_norm_op resnet_unit_op fused_gemm_epilogue_op - fused_gate_attention_op) + fused_gate_attention_op + resnet_basic_block_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) op_library(fusion_lstm_op) +if(WITH_XPU) + op_library(resnet_basic_block_op) +endif() + if(WITH_GPU OR WITH_ROCM) # fused_bn_activation_op needs cudnn 7.4.1 above # HIP not support bn act fuse in MIOPEN diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op.cc b/paddle/fluid/operators/fused/resnet_basic_block_op.cc new file mode 100644 index 0000000000000..d54a889f93aa6 --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_basic_block_op.cc @@ -0,0 +1,576 @@ +/* Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/api/all.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class ResNetBasicBlockOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + // Check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Filter1"), "Input", "Filter1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale1"), "Input", "Scale1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias1"), "Input", "Bias1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Mean1"), "Input", "Mean1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Var1"), "Input", "Var1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Filter2"), "Input", "Filter2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale2"), "Input", "Scale2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias2"), "Input", "Bias2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Mean2"), "Input", "Mean2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Var2"), "Input", "Var2", "ResNetBasicBlockOp"); + + bool has_shortcut = ctx->Attrs().Get("has_shortcut"); + if (has_shortcut) { + OP_INOUT_CHECK( + ctx->HasInput("Filter3"), "Input", "Filter3", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale3"), "Input", "Scale3", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias3"), "Input", "Bias3", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Mean3"), "Input", "Mean3", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasInput("Var3"), "Input", "Var3", "ResNetBasicBlockOp"); + } + + // Check output + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Conv1"), "Output", "Conv1", "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMean1"), + "Output", + "SavedMean1", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstd1"), + "Output", + "SavedInvstd1", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Mean1Out"), "Output", "Mean1Out", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Var1Out"), "Output", "Var1Out", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Conv2"), "Output", "Conv2", "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMean2"), + "Output", + "SavedMean2", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstd2"), + "Output", + "SavedInvstd2", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Mean2Out"), "Output", "Mean2Out", "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Var2Out"), "Output", "Var2Out", "ResNetBasicBlockOp"); + if (has_shortcut) { + OP_INOUT_CHECK( + ctx->HasOutput("Conv3"), "Output", "Conv3", 
"ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMean3"), + "Output", + "SavedMean3", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstd3"), + "Output", + "SavedInvstd3", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK(ctx->HasOutput("Mean3Out"), + "Output", + "Mean3Out", + "ResNetBasicBlockOp"); + OP_INOUT_CHECK( + ctx->HasOutput("Var3Out"), "Output", "Var3Out", "ResNetBasicBlockOp"); + } + + // make sure Mean/RunningMean and Var/RunningVar share memory + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean1")[0], + ctx->Outputs("Mean1Out")[0], + platform::errors::InvalidArgument( + "Mean1 and Mean1Out should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("Var1")[0], + ctx->Outputs("Var1Out")[0], + platform::errors::InvalidArgument( + "Var1 and Var1Out should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean2")[0], + ctx->Outputs("Mean2Out")[0], + platform::errors::InvalidArgument( + "Mean2 and Mean2Out should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("Var2")[0], + ctx->Outputs("Var2Out")[0], + platform::errors::InvalidArgument( + "Var2 and Var2Out should share the same memory")); + + if (has_shortcut) { + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean3")[0], + ctx->Outputs("Mean3Out")[0], + platform::errors::InvalidArgument( + "Mean3 and Mean3Out should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("Var3")[0], + ctx->Outputs("Var3Out")[0], + platform::errors::InvalidArgument( + "Var3 and Var3Out should share the same memory")); + } + + // Check dims of inputs + auto data_format = ctx->Attrs().Get("data_format"); + PADDLE_ENFORCE_EQ( + data_format, + "NCHW", + platform::errors::InvalidArgument("The data format must equal to NCHW. " + "But received: the data format " + "= [%s]", + data_format)); + int stride1 = ctx->Attrs().Get("stride1"); + int stride2 = ctx->Attrs().Get("stride2"); + int padding1 = ctx->Attrs().Get("padding1"); + int padding2 = ctx->Attrs().Get("padding2"); + + const auto x1_dims = ctx->GetInputDim("X"); + const auto w1_dims = ctx->GetInputDim("Filter1"); + const auto bn1_param_dims = ctx->GetInputDim("Scale1"); + PADDLE_ENFORCE_EQ( + x1_dims.size(), + 4, + platform::errors::InvalidArgument("The dimensions of input " + "must equal to 4." 
+ "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x1_dims, + x1_dims.size())); + + // Calculate the dims of output1 + int batch = x1_dims[0]; + int output1_channel = w1_dims[0]; + int filter1_size = w1_dims[2]; + int out1_h = (x1_dims[2] + padding1 * 2 - filter1_size) / stride1 + 1; + int out1_w = (x1_dims[3] + padding1 * 2 - filter1_size) / stride1 + 1; + std::vector out1_shape = {batch, output1_channel, out1_h, out1_w}; + + const auto w2_dims = ctx->GetInputDim("Filter2"); + const auto bn2_param_dims = ctx->GetInputDim("Scale2"); + int output2_channel = w2_dims[0]; + int filter2_size = w2_dims[2]; + int out2_h = (out1_h + padding2 * 2 - filter2_size) / stride2 + 1; + int out2_w = (out1_w + padding2 * 2 - filter2_size) / stride2 + 1; + std::vector out2_shape = {batch, output2_channel, out2_h, out2_w}; + + auto y_dims = phi::make_ddim(out2_shape); + auto conv1_dims = phi::make_ddim(out1_shape); + ctx->SetOutputDim("Y", y_dims); + ctx->SetOutputDim("Conv1", conv1_dims); + ctx->SetOutputDim("SavedMean1", bn1_param_dims); + ctx->SetOutputDim("SavedInvstd1", bn1_param_dims); + ctx->SetOutputDim("Mean1Out", bn1_param_dims); + ctx->SetOutputDim("Var1Out", bn1_param_dims); + ctx->SetOutputDim("Conv2", y_dims); + ctx->SetOutputDim("Conv2Input", conv1_dims); + ctx->SetOutputDim("SavedMean2", bn2_param_dims); + ctx->SetOutputDim("SavedInvstd2", bn2_param_dims); + ctx->SetOutputDim("Mean2Out", bn2_param_dims); + ctx->SetOutputDim("Var2Out", bn2_param_dims); + if (has_shortcut) { + ctx->SetOutputDim("Conv3", y_dims); + ctx->SetOutputDim("SavedMean3", bn2_param_dims); + ctx->SetOutputDim("SavedInvstd3", bn2_param_dims); + ctx->SetOutputDim("Mean3Out", bn2_param_dims); + ctx->SetOutputDim("Var3Out", bn2_param_dims); + } + + bool find_max = ctx->Attrs().Get("find_conv_input_max"); + if (find_max) { + auto max_dims = phi::make_ddim({6}); + ctx->SetOutputDim("MaxInput1", max_dims); + ctx->SetOutputDim("MaxFilter1", max_dims); + ctx->SetOutputDim("MaxInput2", max_dims); + ctx->SetOutputDim("MaxFilter2", max_dims); + if (has_shortcut) { + ctx->SetOutputDim("MaxInput3", max_dims); + ctx->SetOutputDim("MaxFilter3", max_dims); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + // By default, the type of the scale, bias, mean, + // and var tensors should be float when input tensor's dtype is float16. 
+ auto bn_param_type = framework::proto::VarType::FP32; + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Scale1")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Bias1")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Scale2")->dtype()), + platform::errors::InvalidArgument( + "Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, + framework::TransToProtoVarType(ctx.Input("Bias2")->dtype()), + platform::errors::InvalidArgument( + "Bias input should be of float type")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + input_data_type, ctx.GetPlace(), layout, library); + } +}; + +class ResNetBasicBlockOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + // has_shortcut = True: X else: X + // / / + // | | | | + // CONV1 | CONV1 | + // | | | | + // BN1 | BN1 | + // | | | | + // RELU1 | RELU1 | + // | | | | + // CONV2 CONV3 CONV2 | + // | | | | + // BN2 BN3 BN2 | + // \ / \ / + // ADD ADD + // | | + // RELU RELU + // | | + // Y Y + AddInput("X", "Input tensor of conv 1"); + AddInput("Filter1", "Filter tensor of conv 1"); + AddInput("Scale1", "Scale tensor of bn 1"); + AddInput("Bias1", "Bias tensor of bn 1"); + AddInput("Mean1", "Mean tensor of bn 1"); + AddInput("Var1", "Variance tensor of bn 1"); + AddInput("Filter2", "Filter tensor of conv 2"); + AddInput("Scale2", "Scale tensor of bn 2"); + AddInput("Bias2", "Bias tensor of bn 2"); + AddInput("Mean2", "Mean tensor of bn 2"); + AddInput("Var2", "Variance tensor of bn 2"); + AddInput("Filter3", "Filter tensor of conv 3").AsDispensable(); + AddInput("Scale3", "Scale tensor of bn 3").AsDispensable(); + AddInput("Bias3", "Bias tensor of bn 3").AsDispensable(); + AddInput("Mean3", "Mean tensor of bn 3").AsDispensable(); + AddInput("Var3", "Variance tensor of bn 3").AsDispensable(); + AddOutput("Y", "The result of ssd resnet unit"); + AddOutput("Conv1", "The result of conv 1"); + AddOutput("SavedMean1", "Mean of input 1 after conv 1"); + AddOutput("SavedInvstd1", "Invstd of input 1 after conv 1"); + AddOutput("Mean1Out", "Shared memory with Mean1"); + AddOutput("Var1Out", "Shared memory with Var1"); + AddOutput("Conv2", "The result of conv 2"); + AddOutput("Conv2Input", "Conv2 input data"); + AddOutput("SavedMean2", "Mean of input 2 after conv 2"); + AddOutput("SavedInvstd2", "Invstd of input 2 after conv 2"); + AddOutput("Mean2Out", "Shared memory with Mean2"); + AddOutput("Var2Out", "Shared memory with Var2"); + AddOutput("Conv3", "The result of conv 3").AsDispensable(); + AddOutput("SavedMean3", "Mean of input 3 after conv 3").AsDispensable(); + AddOutput("SavedInvstd3", "Invstd of input 3 after conv 3").AsDispensable(); + AddOutput("Mean3Out", "Shared memory with Mean3").AsDispensable(); + AddOutput("Var3Out", "Shared memory with Var3").AsDispensable(); + AddOutput("MaxInput1", "The max value of conv1 input tensor") + .AsDispensable(); + AddOutput("MaxFilter1", "The max value of conv1 filter tensor") + .AsDispensable(); + AddOutput("MaxInput2", "The max value of conv2 input tensor") + .AsDispensable(); + AddOutput("MaxFilter2", "The max value of conv2 filter tensor") + .AsDispensable(); + 
AddOutput("MaxInput3", "The max value of conv3 input tensor") + .AsDispensable(); + AddOutput("MaxFilter3", "The max value of conv3 filter tensor") + .AsDispensable(); + AddAttr("stride1", "Stride of conv1").SetDefault(1); + AddAttr("stride2", "Stride of conv2").SetDefault(1); + AddAttr("stride3", "Stride of conv3").SetDefault(1); + AddAttr("padding1", "Padding of conv1").SetDefault(0); + AddAttr("padding2", "Padding of conv2").SetDefault(0); + AddAttr("padding3", "Padding of conv3").SetDefault(0); + AddAttr("dilation1", "Dilation of conv1").SetDefault(1); + AddAttr("dilation2", "Dilation of conv2").SetDefault(1); + AddAttr("dilation3", "Dilation of conv3").SetDefault(1); + AddAttr("group", "Group of all the 3 conv").SetDefault(1); + AddAttr("momentum", "Momentum of all the 3 bn").SetDefault(0.9); + AddAttr("epsilon", "Epsilon of all the 3 bn").SetDefault(1e-5); + AddAttr("data_format", "").SetDefault("NCHW"); + AddAttr("has_shortcut", "").SetDefault(false); + AddAttr("use_global_stats", "").SetDefault(false); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr( + "trainable_statistics", + "(bool, default false) Whether to calculate mean and variance " + "in test mode. If setting true in test mode, mean and variace " + "will be calculated by current batch statistics.") + .SetDefault(false); + AddAttr("act_type", "The activation type to be fused.") + .SetDefault("relu"); + AddAttr("find_conv_input_max", + "(bool, default true) Whether to calculate max value of conv " + "input tensor.") + .SetDefault(true); + AddComment(R"DOC( +Fusion op of the basic unit of ssd resnet block. +** This is only use for XPU, if has problems, concat zhangyikun02@baidu.com ** +)DOC"); + } +}; + +template +class ResNetBasicBlockGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("resnet_basic_block_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Filter1", this->Input("Filter1")); + op->SetInput("Conv1", this->Output("Conv1")); + op->SetInput("Scale1", this->Input("Scale1")); + op->SetInput("Bias1", this->Input("Bias1")); + op->SetInput("SavedMean1", this->Output("SavedMean1")); + op->SetInput("SavedInvstd1", this->Output("SavedInvstd1")); + op->SetInput("Filter2", this->Input("Filter2")); + op->SetInput("Conv2", this->Output("Conv2")); + op->SetInput("Conv2Input", this->Output("Conv2Input")); + op->SetInput("Scale2", this->Input("Scale2")); + op->SetInput("Bias2", this->Input("Bias2")); + op->SetInput("SavedMean2", this->Output("SavedMean2")); + op->SetInput("SavedInvstd2", this->Output("SavedInvstd2")); + op->SetInput("Filter3", this->Input("Filter3")); + op->SetInput("Conv3", this->Output("Conv3")); + op->SetInput("Scale3", this->Input("Scale3")); + op->SetInput("Bias3", this->Input("Bias3")); + op->SetInput("SavedMean3", this->Output("SavedMean3")); + op->SetInput("SavedInvstd3", this->Output("SavedInvstd3")); + op->SetInput("MaxInput1", this->Output("MaxInput1")); + op->SetInput("MaxFilter1", this->Output("MaxFilter1")); + op->SetInput("MaxInput2", this->Output("MaxInput2")); + op->SetInput("MaxFilter2", this->Output("MaxFilter2")); + op->SetInput("MaxInput3", this->Output("MaxInput3")); + op->SetInput("MaxFilter3", this->Output("MaxFilter3")); + op->SetInput("Y", this->Output("Y")); + op->SetInput(framework::GradVarName("Y"), 
this->OutputGrad("Y")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter1"), + this->InputGrad("Filter1")); + op->SetOutput(framework::GradVarName("Scale1"), this->InputGrad("Scale1")); + op->SetOutput(framework::GradVarName("Bias1"), this->InputGrad("Bias1")); + op->SetOutput(framework::GradVarName("Filter2"), + this->InputGrad("Filter2")); + op->SetOutput(framework::GradVarName("Scale2"), this->InputGrad("Scale2")); + op->SetOutput(framework::GradVarName("Bias2"), this->InputGrad("Bias2")); + op->SetOutput(framework::GradVarName("Filter3"), + this->InputGrad("Filter3")); + op->SetOutput(framework::GradVarName("Scale3"), this->InputGrad("Scale3")); + op->SetOutput(framework::GradVarName("Bias3"), this->InputGrad("Bias3")); + } +}; + +class ResNetBasicBlockOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Y"}}; + return m; + } +}; + +class ResNetBasicBlockGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Filter1"), "Input", "Filter1", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Conv1"), "Input", "Conv1", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale1"), "Input", "Scale1", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias1"), "Input", "Bias1", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMean1"), + "Input", + "SavedMean1", + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedInvstd1"), + "Input", + "SavedInvstd1", + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Filter2"), "Input", "Filter2", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Conv2"), "Input", "Conv2", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale2"), "Input", "Scale2", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias2"), "Input", "Bias2", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMean2"), + "Input", + "SavedMean2", + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedInvstd2"), + "Input", + "SavedInvstd2", + "ResNetBasicBlockGradOp"); + bool has_shortcut = ctx->Attrs().Get("has_shortcut"); + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("Filter3"), + "Input", + "Filter3", + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Scale3"), "Input", "Scale3", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK( + ctx->HasInput("Bias3"), "Input", "Bias3", "ResNetBasicBlockGradOp"); + } + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), + "Input", + framework::GradVarName("Y"), + "ResNetBasicBlockGradOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter1")), + "Output", + framework::GradVarName("Filter1"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale1")), + "Output", + framework::GradVarName("Scale1"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias1")), + "Output", + framework::GradVarName("Bias1"), + "ResNetBasicBlockGradOp"); + 
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter2")), + "Output", + framework::GradVarName("Filter2"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale2")), + "Output", + framework::GradVarName("Scale2"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias2")), + "Output", + framework::GradVarName("Bias2"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), + "Output", + framework::GradVarName("X"), + "ResNetBasicBlockGradOp"); + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter3")), + "Output", + framework::GradVarName("Filter3"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale3")), + "Output", + framework::GradVarName("Scale3"), + "ResNetBasicBlockGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias3")), + "Output", + framework::GradVarName("Bias3"), + "ResNetBasicBlockGradOp"); + } + + const auto x1_dims = ctx->GetInputDim("X"); + const auto filter1_x_dims = ctx->GetInputDim("Filter1"); + const auto param1_dims = ctx->GetInputDim("Scale1"); + const auto filter2_x_dims = ctx->GetInputDim("Filter2"); + const auto param2_dims = ctx->GetInputDim("Scale2"); + ctx->SetOutputDim(framework::GradVarName("X"), x1_dims); + ctx->SetOutputDim(framework::GradVarName("Filter1"), filter1_x_dims); + ctx->SetOutputDim(framework::GradVarName("Scale1"), param1_dims); + ctx->SetOutputDim(framework::GradVarName("Bias1"), param1_dims); + ctx->SetOutputDim(framework::GradVarName("Filter2"), filter2_x_dims); + ctx->SetOutputDim(framework::GradVarName("Scale2"), param2_dims); + ctx->SetOutputDim(framework::GradVarName("Bias2"), param2_dims); + if (has_shortcut) { + const auto filter_z_dims = ctx->GetInputDim("Filter3"); + ctx->SetOutputDim(framework::GradVarName("Filter3"), filter_z_dims); + ctx->SetOutputDim(framework::GradVarName("Scale3"), param2_dims); + ctx->SetOutputDim(framework::GradVarName("Bias3"), param2_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + PADDLE_ENFORCE_NOT_NULL( + ctx.InputVar(framework::GradVarName("Y")), + platform::errors::NotFound( + "Can not find Y@GRAD in the execution context.")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.GetPlace(), + layout, + library); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(resnet_basic_block, + ops::ResNetBasicBlockOp, + ops::ResNetBasicBlockOpMaker, + ops::ResNetBasicBlockOpInferVarType, + ops::ResNetBasicBlockGradOpMaker, + ops::ResNetBasicBlockGradOpMaker); +REGISTER_OPERATOR(resnet_basic_block_grad, ops::ResNetBasicBlockGradOp); diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 4441c06bca2cf..590d9d2f83e8b 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -208,6 +208,23 @@ std::map> op_ins_map = { {"trilinear_interp", {"X", "OutSize"}}, {"nearest_interp", {"X", "OutSize"}}, {"bicubic_interp", {"X", "OutSize"}}, + {"resnet_basic_block", + {"X", + "Filter1", + "Scale1", + "Bias1", + "Mean1", + "Var1", + "Filter2", + "Scale2", + "Bias2", + "Mean2", + "Var2", + "Filter3", + 
"Scale3", + "Bias3", + "Mean3", + "Var3"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -309,6 +326,12 @@ std::map> op_outs_map = { "Beta2PowOut", "MasterParamOut"}}, {"fused_multi_transformer", {"CacheKVOut", "Out"}}, + {"resnet_basic_block", + {"Y", "Conv1", "SavedMean1", "SavedInvstd1", "Mean1Out", + "Var1Out", "Conv2", "SavedMean2", "SavedInvstd2", "Mean2Out", + "Var2Out", "Conv3", "SavedMean3", "SavedInvstd3", "Mean3Out", + "Var3Out", "MaxInput1", "MaxFilter1", "MaxInput2", "MaxFilter2", + "MaxInput3", "MaxFilter3"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -408,6 +431,8 @@ std::map> op_passing_outs_map = { {"concat", {"Out"}}, {"fused_multi_transformer", {"CacheKVOut"}}, {"group_norm", {"Mean", "Variance"}}, + {"resnet_basic_block", + {"Mean1Out", "Var1Out", "Mean2Out", "Var2Out", "Mean3Out", "Var3Out"}}, }; // NOTE(pangyoki): Tensor View Strategy. From aa0c885a0509b4a4ee236b501e0c968d275baa01 Mon Sep 17 00:00:00 2001 From: shixingbo <90814748+bmb0537@users.noreply.github.com> Date: Thu, 7 Jul 2022 11:05:01 +0800 Subject: [PATCH 087/250] Optimized the performance of broadcast for kp XPU2 (#44091) --- paddle/phi/kernels/funcs/elementwise_base.h | 24 +++++---------- .../primitive/datamover_primitives_xpu2.h | 30 +++++++++++++++++-- 2 files changed, 35 insertions(+), 19 deletions(-) mode change 100644 => 100755 paddle/phi/kernels/funcs/elementwise_base.h mode change 100644 => 100755 paddle/phi/kernels/primitive/datamover_primitives_xpu2.h diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h old mode 100644 new mode 100755 index 3e68462c88a5c..ddbbe4b1718f1 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -558,6 +558,9 @@ struct VecSizeGetter { template int GetVectorizedSizeForTensors(const std::vector &ins, const std::vector &outs) { +#ifdef PADDLE_WITH_XPU_KP + int vec_size = 256; +#else using Traits = paddle::platform::FunctionTraits; using ArgsT = typename Traits::ArgsTuple; const int Arity = Traits::arity; @@ -569,6 +572,7 @@ int GetVectorizedSizeForTensors(const std::vector &ins, vec_size = std::min(vec_size, phi::GetVectorizedSize((*iter)->data())); } +#endif return vec_size; } @@ -784,7 +788,6 @@ template void LaunchElementwiseCudaKernel(const KPDevice &ctx, const std::vector &ins, std::vector *outs, - int read_lens, Functor func) { // There are at least 1 output, but maybe 0 input (ins.size() == 0). 
// For large tensor numel * sizeof(T) > 2^31, we must use int64_t as index @@ -800,6 +803,7 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx, #ifdef PADDLE_WITH_XPU_KP int block_size = 64; int grid_size = 8; + int read_lens = kps::details::GetXpuReadLens(numel, block_size, grid_size); auto stream = ctx.x_context()->xpu_stream; int64_t main_offset = (numel / (read_lens * block_size)) * read_lens * block_size; @@ -853,32 +857,20 @@ void ElementwiseKernel(const KPDevice &ctx, } } -#ifdef PADDLE_WITH_XPU_KP - const int buf_size = 256; - int numel = (*outs)[0]->numel(); - int block_size = 64; - int grid_size = 8; - int nthreads = block_size * grid_size; - int read_lens = - std::min(buf_size, kps::details::RoundUpDiv(numel, 32 * nthreads) * 32); - int vec_size = buf_size; -#else // calculate the max vec_size for all ins and outs int vec_size = GetVectorizedSizeForTensors(ins, *outs); - int read_lens = vec_size; -#endif switch (vec_size) { case VecSizeL: LaunchElementwiseCudaKernel( - ctx, ins, outs, read_lens, func); + ctx, ins, outs, func); break; case VecSizeM: LaunchElementwiseCudaKernel( - ctx, ins, outs, read_lens, func); + ctx, ins, outs, func); break; case VecSizeS: LaunchElementwiseCudaKernel( - ctx, ins, outs, read_lens, func); + ctx, ins, outs, func); break; default: { PADDLE_THROW(phi::errors::Unimplemented( diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h old mode 100644 new mode 100755 index f2d187f89b252..68eb11bd6d0b9 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -21,7 +21,17 @@ namespace phi { namespace kps { namespace details { -int RoundUpDiv(int n, int k) { return (n + k - 1) / k; } +static inline int RoundUpDiv(int n, int k) { return (n + k - 1) / k; } + +static inline int GetXpuReadLens(int numel, int block_num, int grid_num) { + const int buf_size = 256; + int nthreads = block_num * grid_num; + if (numel / nthreads == 1) { + return numel / nthreads * 4; + } + int read_lens = std::min(buf_size, RoundUpDiv(numel, 32 * nthreads) * 32); + return read_lens; +} enum class OptType { // Optimize type of calc after input shape compressed CanNotOptimize = -1, // can not optimize, broadcast first @@ -98,8 +108,10 @@ struct BroadcastConfig { strides_out_tmp[i] = strides_out_tmp[i - 1] * out_dims[i - 1]; } + int numel_out = 1; for (int i = 0; i < dim_size; i++) { dim_tmp[i] = in_dims[i]; + numel_out = out_dims[i] * numel_out; } kDims = dim_size; memcpy(strides_in, strides_in_tmp.data(), kDims * sizeof(int)); @@ -108,13 +120,25 @@ struct BroadcastConfig { cmp_res = get_mnk_for_broadcast_ops(in_dims, y_in_dims); get_opt_type(); - buf_len = get_buf_len(); + buf_len = get_buf_len(numel_out); + int numel_x = 1; + int numel_y = 1; + for (int i = 0; i < dim_size; i++) { + numel_x = in_dims[i] * numel_x; + numel_y = y_in_dims[i] * numel_y; + } + if (numel_out == numel_x && numel_out == numel_y) { + buf_len = GetXpuReadLens(numel_out, 8, 64); + } } - int get_buf_len() { + int get_buf_len(int numel) { if (cmp_type == OptType::CanNotOptimize) { return 256; } + if (cmp_type == OptType::N_1) { + return kps::details::GetXpuReadLens(numel, 8, 64); + } int max_buf_len = 512; int buf_len = m / 16 * 16; if (buf_len == 0) { From a7c98ddbc48c4d28cdd89ffc92fd776b6af1396a Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Thu, 7 Jul 2022 11:38:30 +0800 Subject: [PATCH 088/250] Fix higher order deriv with inplace (#44020) * 
fix deriv with inplace * fix double grad bugs * remove additional file * fix compat dygraph mode * fix yaml remove additional yaml * fix slice double grad error and auto code gen logic error for higher order differentiate * fix fix_higher_order_deriv * remove additional include * fix fix_higher_order_deriv --- .../eager_generated/backwards/CMakeLists.txt | 2 +- .../eager_generated/forwards/CMakeLists.txt | 2 +- paddle/fluid/eager/api/manual/CMakeLists.txt | 7 + .../api/manual/eager_manual/CMakeLists.txt | 8 + .../manual/eager_manual/dygraph_forward_api.h | 30 ++ .../eager_manual/forwards/CMakeLists.txt | 10 + .../forwards/conv2d_fwd_function.cc | 153 +++++++++ .../manual/eager_manual/nodes/CMakeLists.txt | 8 + .../manual/eager_manual/nodes/conv2d_nodes.cc | 308 ++++++++++++++++++ .../api/manual/eager_manual/nodes/nodes.h | 182 +++++++++++ .../final_state_generator/eager_gen.py | 211 ++++++------ paddle/fluid/eager/backward.cc | 54 +-- paddle/fluid/eager/grad_node_info.h | 6 + paddle/fluid/eager/grad_tensor_holder.cc | 16 +- paddle/fluid/eager/tensor_wrapper.h | 10 +- .../fluid/imperative/partial_grad_engine.cc | 16 + paddle/fluid/imperative/prepared_operator.h | 8 +- paddle/fluid/pybind/eager.cc | 5 +- paddle/fluid/pybind/eager_properties.cc | 1 + paddle/phi/api/lib/CMakeLists.txt | 4 +- paddle/phi/api/yaml/legacy_backward.yaml | 33 ++ 21 files changed, 933 insertions(+), 141 deletions(-) create mode 100644 paddle/fluid/eager/api/manual/eager_manual/CMakeLists.txt create mode 100644 paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h create mode 100644 paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt create mode 100644 paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc create mode 100644 paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt create mode 100644 paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc create mode 100644 paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index f704d2a49184b..fbd552ef00da7 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -7,6 +7,6 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( final_dygraph_node SRCS nodes.cc - DEPS ${eager_deps}) + DEPS ${eager_deps} ${eager_manual_nodes}) add_dependencies(final_dygraph_node eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 8d6df647999bd..66053baa5813b 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -7,6 +7,6 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( final_dygraph_function SRCS dygraph_functions.cc - DEPS ${eager_deps}) + DEPS ${eager_deps} ${eager_manual_functions}) add_dependencies(final_dygraph_function eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/manual/CMakeLists.txt b/paddle/fluid/eager/api/manual/CMakeLists.txt index ebfcaad2eeac7..e6db90ccc5bbe 100644 --- a/paddle/fluid/eager/api/manual/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/CMakeLists.txt @@ -6,4 +6,11 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) set(fluid_manual_nodes 
${fluid_manual_nodes} PARENT_SCOPE) + add_subdirectory(eager_manual) + set(eager_manual_functions + ${eager_manual_functions} + PARENT_SCOPE) + set(eager_manual_nodes + ${eager_manual_nodes} + PARENT_SCOPE) endif() diff --git a/paddle/fluid/eager/api/manual/eager_manual/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/CMakeLists.txt new file mode 100644 index 0000000000000..09420f368507d --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/CMakeLists.txt @@ -0,0 +1,8 @@ +add_subdirectory(forwards) +add_subdirectory(nodes) +set(eager_manual_functions + ${eager_manual_functions} + PARENT_SCOPE) +set(eager_manual_nodes + ${eager_manual_nodes} + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h new file mode 100644 index 0000000000000..0f06831068161 --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/api/include/tensor.h" + +paddle::experimental::Tensor conv2d_final_state_dygraph_function( + const paddle::experimental::Tensor& input, + const paddle::experimental::Tensor& filter, + std::vector strides, + std::vector paddings, + std::string paddding_algorithm, + int groups, + std::vector dilations, + std::string data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search); diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt new file mode 100644 index 0000000000000..0ed2f26c0b255 --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt @@ -0,0 +1,10 @@ +cc_library( + conv2d_fwd_function + SRCS conv2d_fwd_function.cc + DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + +add_dependencies(conv2d_fwd_function eager_codegen) + +set(eager_manual_functions + conv2d_fwd_function + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc new file mode 100644 index 0000000000000..f7bff6fb88997 --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/eager_amp_auto_cast.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" +DECLARE_bool(check_nan_inf); + +paddle::experimental::Tensor conv2d_final_state_dygraph_function( + const paddle::experimental::Tensor& input, + const paddle::experimental::Tensor& filter, + std::vector strides, + std::vector paddings, + std::string paddding_algorithm, + int groups, + std::vector dilations, + std::string data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search) { + // Dygraph Record Event + paddle::platform::RecordEvent dygraph_entrance_record_event( + "conv2d dygraph", paddle::platform::TracerEventType::Operator, 1); + + // AMP Logic + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + auto op_name = phi::TransToFluidOpName("conv2d"); + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {{input}, {filter}}; + + auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector); + + auto NEW_input = + egr::EagerAmpAutoCast("input", input, amp_dst_dtype, op_name); + auto NEW_filter = + egr::EagerAmpAutoCast("filter", filter, amp_dst_dtype, op_name); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return conv2d_final_state_dygraph_function(NEW_input, + NEW_filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search); + } + } + + // Get Input AutoGradMeta + egr::AutogradMeta* input_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(input); + egr::AutogradMeta* filter_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(filter); + // Forward API Call + VLOG(3) << "Final State Running: " + << "conv2d_final_state_dygraph_function"; + auto api_result = paddle::experimental::conv2d(input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search); + // Check NaN and Inf if needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("conv2d", api_result); + } + + // Get Outputs + auto& out = api_result; + + // Get Output AutoGradMeta + egr::AutogradMeta* out_autograd_meta = egr::EagerUtils::autograd_meta(&out); + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, input_autograd_meta, filter_autograd_meta); + + // Check Inplace if needed + + // Node Creation + if (require_any_grad) { + paddle::platform::RecordEvent node_creation_record_event( + "conv2d node_creation", + paddle::platform::TracerEventType::OperatorInner, + 1); + + egr::EagerUtils::PassStopGradient(false, out_autograd_meta); + + // Node Construction + auto grad_node = + std::shared_ptr(new Conv2dGradNodeFinal(1, 2)); + // SetAttributes if needed + grad_node->SetAttributestrides(strides); + grad_node->SetAttributepaddings(paddings); + 
grad_node->SetAttributepaddding_algorithm(paddding_algorithm); + grad_node->SetAttributegroups(groups); + grad_node->SetAttributedilations(dilations); + grad_node->SetAttributedata_format(data_format); + grad_node->SetAttributeuse_addto(use_addto); + grad_node->SetAttributeworkspace_size_MB(workspace_size_MB); + grad_node->SetAttributeexhaustive_search(exhaustive_search); + // Set TensorWrappers for Forward Inputs if needed + grad_node->SetTensorWrapperinput(input); + grad_node->SetTensorWrapperfilter(filter); + // SetGradOutMeta & SetEdges + grad_node->SetGradOutMeta(input, 0); + grad_node->SetGradOutMeta(filter, 1); + // SetOutRank & SetHistory & SetGradInMeta & RetainGrad + if (out_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(out_autograd_meta, 0); + } + if (out_autograd_meta) { + egr::EagerUtils::SetHistory(out_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(out, 0); + egr::EagerUtils::CheckAndRetainGrad(out); + // Set TensorWrappers for Forward Outputs if needed + } + + // Returns + return out; +} diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt new file mode 100644 index 0000000000000..21642fbd6495c --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt @@ -0,0 +1,8 @@ +cc_library( + conv2d_nodes + SRCS conv2d_nodes.cc + DEPS ${eager_deps} ${fluid_deps}) + +set(eager_manual_nodes + conv2d_nodes + PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc new file mode 100644 index 0000000000000..ce8d647cb9ece --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc @@ -0,0 +1,308 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "glog/logging.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/backward/sparse_bw_api.h" + +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" +#include "paddle/phi/api/include/sparse_api.h" +DECLARE_bool(check_nan_inf); + +paddle::small_vector, + egr::kSlotSmallVectorSize> +Conv2dGradNodeFinal::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + // Fill Zero For GradIn Tensors + VLOG(3) << " Running Conv2dGradNodeFinal: " << this; + // Apply Gradient Hooks + auto hooked_grads = ApplyGradientHooks(grads); + + // Collect GradIn Tensors, Attrs and Recovered TensorWrappers + auto input = egr::EagerUtils::RecoverTensorWrapper(&this->input_); + auto filter = egr::EagerUtils::RecoverTensorWrapper(&this->filter_); + auto& grad_out = hooked_grads[0][0]; + auto& strides = this->strides_; + auto& paddings = this->paddings_; + auto& paddding_algorithm = this->paddding_algorithm_; + auto& groups = this->groups_; + auto& dilations = this->dilations_; + auto& data_format = this->data_format_; + auto& use_addto = this->use_addto_; + auto& workspace_size_MB = this->workspace_size_MB_; + auto& exhaustive_search = this->exhaustive_search_; + // Prepare Grad function call + + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + returns(2); + for (int i = 0; i < 2; ++i) { + out_metas[i].size() == 0 ? returns[i].resize(1) + : returns[i].resize(out_metas[i].size()); + } + + auto* api_output_0 = + (out_metas[0].empty() || out_metas[0][0].IsStopGradient()) + ? nullptr + : &returns[0][0]; + auto* api_output_1 = + (out_metas[1].empty() || out_metas[1][0].IsStopGradient()) + ? nullptr + : &returns[1][0]; + // Runtime check if we need next grad + bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph; + + // Inplace Check + + // Inplace Strategy + + // Call grad_api function + VLOG(3) << "Final State Running: Conv2dGradNodeFinal"; + + paddle::experimental::conv2d_grad(input, + filter, + grad_out, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + api_output_0, + api_output_1); + // Check NaN and Inf id needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("conv2d_grad", returns); + } + + // Get GradOut autograd_meta + + auto& grad_input = returns[0][0]; + egr::AutogradMeta* grad_input_autograd_meta = + returns[0][0].initialized() ? egr::EagerUtils::autograd_meta(&grad_input) + : nullptr; + if (grad_input_autograd_meta) + grad_input_autograd_meta->SetStopGradient(false); + VLOG(3) << "Conv2dGradNodeFinal grad_input_autograd_meta: " + << grad_input_autograd_meta; + + auto& grad_filter = returns[1][0]; + egr::AutogradMeta* grad_filter_autograd_meta = + returns[1][0].initialized() ? 
egr::EagerUtils::autograd_meta(&grad_filter) + : nullptr; + if (grad_filter_autograd_meta) + grad_filter_autograd_meta->SetStopGradient(false); + VLOG(3) << "Conv2dGradNodeFinal grad_filter_autograd_meta: " + << grad_filter_autograd_meta; + + // Create Grad Node + if (trace_backward) { + paddle::platform::RecordEvent node_creation_record_event( + "conv2d_grad node_creation", + paddle::platform::TracerEventType::OperatorInner, + 1); + + // Node Construction + auto grad_node = std::shared_ptr( + new Conv2dDoubleGradNodeFinal(2, 3)); + // SetAttributes if needed + grad_node->SetAttributestrides(strides); + grad_node->SetAttributepaddings(paddings); + grad_node->SetAttributepaddding_algorithm(paddding_algorithm); + grad_node->SetAttributegroups(groups); + grad_node->SetAttributedilations(dilations); + grad_node->SetAttributedata_format(data_format); + grad_node->SetAttributeuse_addto(use_addto); + grad_node->SetAttributeworkspace_size_MB(workspace_size_MB); + grad_node->SetAttributeexhaustive_search(exhaustive_search); + // Set TensorWrappers for Forward Inputs if needed + grad_node->SetTensorWrapperinput(input); + grad_node->SetTensorWrapperfilter(filter); + grad_node->SetTensorWrappergrad_out(grad_out); + // SetGradOutMeta & SetEdges + if (grad_filter_autograd_meta) { + grad_node->SetGradOutMeta(input, 0); + } + if (grad_input_autograd_meta) { + grad_node->SetGradOutMeta(filter, 1); + grad_node->SetGradOutMeta(grad_out, 2); + } + // SetOutRank & SetHistory & SetGradInMeta & RetainGrad + if (grad_input_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(grad_input_autograd_meta, 0); + } + if (grad_filter_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(grad_filter_autograd_meta, 1); + } + if (grad_input_autograd_meta) { + egr::EagerUtils::SetHistory(grad_input_autograd_meta, grad_node); + } + if (grad_filter_autograd_meta) { + egr::EagerUtils::SetHistory(grad_filter_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(grad_input, 0); + grad_node->SetGradInMeta(grad_filter, 1); + egr::EagerUtils::CheckAndRetainGrad(grad_input); + egr::EagerUtils::CheckAndRetainGrad(grad_filter); + // Set TensorWrappers for Forward Outputs if needed + } + + // Return + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns); + return returns; +} + +paddle::small_vector, + egr::kSlotSmallVectorSize> +Conv2dDoubleGradNodeFinal::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + // Fill Zero For GradIn Tensors + const auto& input_metas = this->InputMeta(); + egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[0][0], + input_metas[0][0]); + egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&grads[1][0], + input_metas[1][0]); + + // Apply Gradient Hooks + auto hooked_grads = ApplyGradientHooks(grads); + + // Collect GradIn Tensors, Attrs and Recovered TensorWrappers + auto input = egr::EagerUtils::RecoverTensorWrapper(&this->input_); + auto filter = egr::EagerUtils::RecoverTensorWrapper(&this->filter_); + auto grad_out = egr::EagerUtils::RecoverTensorWrapper(&this->grad_out_); + auto& grad_input_grad = hooked_grads[0][0]; + + paddle::optional grad_input_grad_optional; + if (grad_input_grad.initialized()) + grad_input_grad_optional = + paddle::make_optional(grad_input_grad); + + auto& grad_filter_grad = hooked_grads[1][0]; + + paddle::optional grad_filter_grad_optional; + if (grad_filter_grad.initialized()) + grad_filter_grad_optional = + paddle::make_optional(grad_filter_grad); + + auto& strides = this->strides_; 
+ auto& paddings = this->paddings_; + auto& paddding_algorithm = this->paddding_algorithm_; + auto& groups = this->groups_; + auto& dilations = this->dilations_; + auto& data_format = this->data_format_; + auto& use_addto = this->use_addto_; + auto& workspace_size_MB = this->workspace_size_MB_; + auto& exhaustive_search = this->exhaustive_search_; + // Prepare Grad function call + + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + returns(3); + for (int i = 0; i < 3; ++i) { + out_metas[i].size() == 0 ? returns[i].resize(1) + : returns[i].resize(out_metas[i].size()); + } + + auto* api_output_0 = + (out_metas[0].empty() || out_metas[0][0].IsStopGradient()) + ? nullptr + : &returns[0][0]; + auto* api_output_1 = + (out_metas[1].empty() || out_metas[1][0].IsStopGradient()) + ? nullptr + : &returns[1][0]; + auto* api_output_2 = + (out_metas[2].empty() || out_metas[2][0].IsStopGradient()) + ? nullptr + : &returns[2][0]; + // Runtime check if we need next grad + + // Inplace Check + + // Inplace Strategy + + // Call grad_api function + VLOG(3) << "Final State Running: Conv2dGradGradNodeFinal"; + + paddle::experimental::conv2d_grad_grad(input, + filter, + grad_out, + grad_input_grad_optional, + grad_filter_grad_optional, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search, + api_output_0, + api_output_1, + api_output_2); + // Check NaN and Inf id needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("conv2d_grad_grad", returns); + } + + // Get GradOut autograd_meta + + auto& input_grad = returns[0][0]; + egr::AutogradMeta* input_grad_autograd_meta = + returns[0][0].initialized() ? egr::EagerUtils::autograd_meta(&input_grad) + : nullptr; + if (input_grad_autograd_meta) + input_grad_autograd_meta->SetStopGradient(false); + + auto& filter_grad = returns[1][0]; + egr::AutogradMeta* filter_grad_autograd_meta = + returns[1][0].initialized() ? egr::EagerUtils::autograd_meta(&filter_grad) + : nullptr; + if (filter_grad_autograd_meta) + filter_grad_autograd_meta->SetStopGradient(false); + + auto& grad_out_grad = returns[2][0]; + egr::AutogradMeta* grad_out_grad_autograd_meta = + returns[2][0].initialized() + ? egr::EagerUtils::autograd_meta(&grad_out_grad) + : nullptr; + if (grad_out_grad_autograd_meta) + grad_out_grad_autograd_meta->SetStopGradient(false); + + // Create Grad Node + + // Return + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns); + return returns; +} diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h new file mode 100644 index 0000000000000..f202b64f0b709 --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h @@ -0,0 +1,182 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" + +class Conv2dGradNodeFinal : public egr::GradNodeBase { + public: + Conv2dGradNodeFinal() : egr::GradNodeBase() {} + Conv2dGradNodeFinal(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + ~Conv2dGradNodeFinal() override = default; + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, // NOLINT + bool is_new_grad = false) override; // NOLINT + std::string name() override { return "Conv2dGradNodeFinal"; } + + void ClearTensorWrappers() override { + input_.clear(); + filter_.clear(); + + SetIsTensorWrappersCleared(true); + } + + std::shared_ptr Copy() const override { + auto copied_node = + std::shared_ptr(new Conv2dGradNodeFinal(*this)); + VLOG(3) << "Copy Conv2dGradNodeFinal: " << this + << " to: " << copied_node.get(); + return copied_node; + } + + // SetTensorWrapperX, SetTensorWrapperY, ... + void SetTensorWrapperinput(const paddle::experimental::Tensor& input) { + input_ = egr::TensorWrapper(input, false); + } + void SetTensorWrapperfilter(const paddle::experimental::Tensor& filter) { + filter_ = egr::TensorWrapper(filter, false); + } + + // SetAttributes + void SetAttributestrides(const std::vector& strides) { + strides_ = strides; + } + void SetAttributepaddings(const std::vector& paddings) { + paddings_ = paddings; + } + void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) { + paddding_algorithm_ = paddding_algorithm; + } + void SetAttributegroups(const int& groups) { groups_ = groups; } + void SetAttributedilations(const std::vector& dilations) { + dilations_ = dilations; + } + void SetAttributedata_format(const std::string& data_format) { + data_format_ = data_format; + } + void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; } + void SetAttributeworkspace_size_MB(const int& workspace_size_MB) { + workspace_size_MB_ = workspace_size_MB; + } + void SetAttributeexhaustive_search(const bool& exhaustive_search) { + exhaustive_search_ = exhaustive_search; + } + + private: + // TensorWrappers + egr::TensorWrapper input_; + egr::TensorWrapper filter_; + + // Attributes + std::vector strides_; + std::vector paddings_; + std::string paddding_algorithm_; + int groups_; + std::vector dilations_; + std::string data_format_; + bool use_addto_; + int workspace_size_MB_; + bool exhaustive_search_; +}; + +class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { + public: + Conv2dDoubleGradNodeFinal() : egr::GradNodeBase() {} + Conv2dDoubleGradNodeFinal(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + ~Conv2dDoubleGradNodeFinal() override = default; + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, // NOLINT + bool is_new_grad = false) override; // NOLINT + std::string name() override { return "Conv2dDoubleGradNodeFinal"; } + + void ClearTensorWrappers() override { + input_.clear(); + filter_.clear(); + grad_out_.clear(); + + SetIsTensorWrappersCleared(true); + } + + std::shared_ptr Copy() const override { + auto copied_node = std::shared_ptr( + new Conv2dDoubleGradNodeFinal(*this)); + return copied_node; + } + + // SetTensorWrapperX, SetTensorWrapperY, 
... + void SetTensorWrapperinput(const paddle::experimental::Tensor& input) { + input_ = egr::TensorWrapper(input, false); + } + void SetTensorWrapperfilter(const paddle::experimental::Tensor& filter) { + filter_ = egr::TensorWrapper(filter, false); + } + void SetTensorWrappergrad_out(const paddle::experimental::Tensor& grad_out) { + grad_out_ = egr::TensorWrapper(grad_out, false); + } + + // SetAttributes + void SetAttributestrides(const std::vector& strides) { + strides_ = strides; + } + void SetAttributepaddings(const std::vector& paddings) { + paddings_ = paddings; + } + void SetAttributepaddding_algorithm(const std::string& paddding_algorithm) { + paddding_algorithm_ = paddding_algorithm; + } + void SetAttributegroups(const int& groups) { groups_ = groups; } + void SetAttributedilations(const std::vector& dilations) { + dilations_ = dilations; + } + void SetAttributedata_format(const std::string& data_format) { + data_format_ = data_format; + } + void SetAttributeuse_addto(const bool& use_addto) { use_addto_ = use_addto; } + void SetAttributeworkspace_size_MB(const int& workspace_size_MB) { + workspace_size_MB_ = workspace_size_MB; + } + void SetAttributeexhaustive_search(const bool& exhaustive_search) { + exhaustive_search_ = exhaustive_search; + } + + private: + // TensorWrappers + egr::TensorWrapper input_; + egr::TensorWrapper filter_; + egr::TensorWrapper grad_out_; + + // Attributes + std::vector strides_; + std::vector paddings_; + std::string paddding_algorithm_; + int groups_; + std::vector dilations_; + std::string data_format_; + bool use_addto_; + int workspace_size_MB_; + bool exhaustive_search_; +}; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index d406f00b25039..a6f5a36e389a9 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -40,6 +40,8 @@ # keeping the code compatible, here we also skip inplace check in new dygraph temporarily, # and this will be fixed in the futrue. 
inplace_check_blacklist = set(["assign_out_"]) +# # --- Black Ops list that's NO NEED to apply backward code generation +black_ops_list = ["conv2d", "conv2d_grad", "conv2d_grad_grad"] ########### @@ -154,9 +156,7 @@ class {} : public egr::GradNodeBase {{ {} // Prepare Grad function call {} - // Get GradIn autograd_meta -{} - // Compute Require Grad + // Runtime check if we need next grad {} // Inplace Check {} @@ -229,6 +229,27 @@ class {} : public egr::GradNodeBase {{ }} """ +HIHGER_ORDER_DERIVATIVE_VALUE_TEMPLATE = \ +""" if(trace_backward) {{ +{} + // Node Construction +{} + // SetAttributes if needed +{} + // Set TensorWrappers for Forward Inputs if needed +{} + // SetGradOutMeta & SetEdges +{} + // SetOutRank & SetHistory & SetGradInMeta & RetainGrad +{} +{} +{} +{} + // Set TensorWrappers for Forward Outputs if needed +{} + }} +""" + NAMESPACE_WRAPPER_TEMPLATE = \ """ namespace {} {{ @@ -252,7 +273,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/phi/api/include/sparse_api.h" - +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" DECLARE_bool(check_nan_inf); {} """ @@ -279,7 +300,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/eager_amp_auto_cast.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/fluid/eager/nan_inf_utils.h" - +#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" DECLARE_bool(check_nan_inf); {} {} @@ -294,7 +315,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/eager/to_static/run_program_op_func.h" - +#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" {} {} """ @@ -584,7 +605,6 @@ def CollectBackwardInfo(self): self.backward_api_name = forward_api_contents['backward'] self.backward_forward_str = grad_api_contents['forward'] - backward_args_str = grad_api_contents['args'] backward_returns_str = grad_api_contents['output'] @@ -663,7 +683,7 @@ def SlotNameMatching(self): backward_output_pos ] - def GenerateNodeCreationCodes(self): + def GenerateNodeCreationCodes(self, for_backward=False): forward_api_name = self.forward_api_name forward_inputs_position_map = self.forward_inputs_position_map forward_outputs_position_map = self.forward_outputs_position_map @@ -794,13 +814,21 @@ def GenerateNodeCreationCodes(self): node_event_name = forward_api_name + " node_creation" node_creation_event_str = f"{indent}paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::OperatorInner, 1);\n" + if not for_backward: + self.node_creation_str = FORWARD_BODY_TEMPLATE.format( + node_creation_event_str, pass_stop_gradient_args_str, + node_construction_str, set_attributes_str, + set_input_tensor_wrappers_str, set_grad_out_meta_str, + set_out_rank_str, set_history_str, set_grad_in_meta_str, + set_retain_grad_str, set_output_tensor_wrappers_str) + else: + self.node_creation_str = HIHGER_ORDER_DERIVATIVE_VALUE_TEMPLATE.format( + node_creation_event_str, node_construction_str, + set_attributes_str, set_input_tensor_wrappers_str, + set_grad_out_meta_str, set_out_rank_str, set_history_str, + set_grad_in_meta_str, set_retain_grad_str, + set_output_tensor_wrappers_str) - self.node_creation_str = FORWARD_BODY_TEMPLATE.format( - node_creation_event_str, pass_stop_gradient_args_str, - node_construction_str, set_attributes_str, - set_input_tensor_wrappers_str, set_grad_out_meta_str, - 
set_out_rank_str, set_history_str, set_grad_in_meta_str, - set_retain_grad_str, set_output_tensor_wrappers_str) self.grad_node_out_list = grad_node_out_list def run(self): @@ -1234,7 +1262,7 @@ def GenerateHigherOrderNodeCreationCode(self): next_node_generator = DygraphFunctionGeneratorBase( forward_api_contents, backward_api_contents, namespace) next_node_generator.run() - next_node_generator.GenerateNodeCreationCodes() + next_node_generator.GenerateNodeCreationCodes(True) next_grad_node_creation_str = next_node_generator.node_creation_str next_grad_node_out_list = next_node_generator.grad_node_out_list @@ -1342,6 +1370,7 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, inplace_grad_input_str = "" inplaced_tensor_wrapper = False inplace_check_str = "" + optional_inplace_var_name = [] # Grad Ins from TensorWrappers for name, (_, is_fwd_input, grad_api_position), in backward_forward_inputs_map.items(): @@ -1351,6 +1380,13 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, is_optional = (name in self.optional_inputs) tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});" if backward_inplace_map and name in backward_inplace_map.keys(): + if len(next_grad_node_creation_str) > 0: + if (transformed_tensor_name + in backward_forward_inputs_map_next) and ( + backward_forward_inputs_map_next[ + transformed_tensor_name][1]): + optional_inplace_var_name.append( + transformed_tensor_name) tensor_wrapper_intermidiate_tensor_str = f"(&this->{tensor_wrapper_name})->get_intermidiate_tensor()" inplace_check_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, name, @@ -1371,7 +1407,6 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, get_grad_in_args_list.append(tensor_wrapper_recover_str) - optional_inplace_check = False # Grad Ins from grads for name, (ttype, fwd_position, grad_api_position) in backward_grad_inputs_map.items(): @@ -1388,7 +1423,8 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, in backward_forward_inputs_map_next) and ( backward_forward_inputs_map_next[ transformed_tensor_name][1]): - optional_inplace_check = False + optional_inplace_var_name.append( + transformed_tensor_name) grads_tensor_str = f"grads[{fwd_position}][0]" inplace_check_str += CHECK_BACKWARD_INPLACE_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, name, @@ -1441,25 +1477,25 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, transformed_tensor_name = self.TransformToNextGradName(name) out_index = out_index + 1 grad_api_args.append(f"api_output_{out_index}") - if not optional_inplace_check: - optional_inplace_str = "VLOG(6) << \"No Inplace should happend for wrappered input\";" + if inplace_grad_input_str in optional_inplace_var_name: + optional_inplace_str = "VLOG(6) << \"No Inplace should happend for wrappered input: {inplace_grad_input_str}\";" else: optional_inplace_str = f"""if (api_output_{out_index} != nullptr && can_be_inplaced) {{ - egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); - }}""" + egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); + }}""" if IsPlainTensorType(ttype): if backward_inplace_map and name in backward_inplace_map.values( ): - inplace_str = f"""if (api_output_{out_index} != nullptr && can_be_inplaced) {{ - 
egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); - }}""" + inplace_str = f""" if (api_output_{out_index} != nullptr && can_be_inplaced) {{ + egr::EagerUtils::HandleViewBetweenInputAndOutput({inplace_grad_input_str}, api_output_{out_index}); + }}""" if len(next_grad_node_creation_str) > 0: inplace_for_grad_outs_str += f""" - if (!require_any_grad) {{ - {inplace_str} - }}else{{ + if (trace_backward) {{ {optional_inplace_str} + }} else {{ + {inplace_str} }}""" else: inplace_for_grad_outs_str += inplace_str @@ -1490,84 +1526,53 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, backward_api_name, "returns") # Prepare for Node Creation if Necessary - inputs_autograd_meta_str = "" outputs_autograd_meta_str = "" - compute_require_grad_str = "" + compute_require_next_grad_str = "" if len(next_grad_node_creation_str) > 0: - # 1. Get Grad Input AutoGradMeta - inputs_autograd_meta_list = [] - compute_require_grad_args_list = ["trace_backward"] - for name, (ttype, pos, - grad_api_position) in backward_grad_inputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) - if transformed_tensor_name in next_grad_node_out_list: - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( - transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append( - input_autograd_meta_name) - - # 2. Get TensorWrapper AutoGradMeta - for name, (ttype, _, pos), in backward_forward_inputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) - if transformed_tensor_name in next_grad_node_out_list: - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( - transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append( - input_autograd_meta_name) - - inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) - compute_require_grad_args_str = ",".join( - compute_require_grad_args_list) - - # 3. 
Get Output AutoGradMeta - outputs_autograd_meta_list = [] - num_fwd_outputs = len(backward_grad_outputs_map.keys()) - for name, (rtype, pos, - grad_api_position) in backward_grad_outputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) - - output_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - output_autograd_meta_vec_name = GetAutoGradMetaVectorName( - transformed_tensor_name) - if IsPlainTensorType(rtype): - output_autograd_meta = f""" + compute_require_next_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n" + + # 3. Get Output AutoGradMeta + outputs_autograd_meta_list = [] + # TODO(jiabin): Optimize this with SetStopGradient instead of Pass Stop gradient + + num_fwd_outputs = len(backward_grad_outputs_map.keys()) + for name, (rtype, pos, + grad_api_position) in backward_grad_outputs_map.items(): + transformed_tensor_name = self.TransformToNextGradName(name) + + output_autograd_meta_name = GetAutoGradMetaName( + transformed_tensor_name) + output_autograd_meta_vec_name = GetAutoGradMetaVectorName( + transformed_tensor_name) + if IsPlainTensorType(rtype): + output_autograd_meta = f""" auto& {transformed_tensor_name} = returns[{pos}][0]; - egr::AutogradMeta* {output_autograd_meta_name} = returns[{pos}][0].initialized() ? egr::EagerUtils::autograd_meta(&{transformed_tensor_name}) : nullptr;""" + egr::AutogradMeta* {output_autograd_meta_name} = returns[{pos}][0].initialized() ? egr::EagerUtils::autograd_meta(&{transformed_tensor_name}) : nullptr; + if ({output_autograd_meta_name}) {output_autograd_meta_name}->SetStopGradient(false); + """ + else: + assert IsVectorTensorType(rtype) + if len(next_grad_node_creation_str) > 0: + output_autograd_meta = f""" + auto& {transformed_tensor_name} = returns[{pos}]; + std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); + std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name}; + for(auto* meta : {output_autograd_meta_vec_name}){{ + meta->SetStopGradient(false); + }} +""" else: - assert IsVectorTensorType(rtype) output_autograd_meta = f""" - auto& {transformed_tensor_name} = returns[{pos}]; - std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); - std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name}; + auto& {transformed_tensor_name} = returns[{pos}]; + std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); + for(auto* meta : {output_autograd_meta_vec_name}){{ + meta->SetStopGradient(false); + }} """ + outputs_autograd_meta_list.append(output_autograd_meta) - outputs_autograd_meta_list.append(output_autograd_meta) - outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) - - compute_require_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n" - compute_require_grad_str += f"{indent}bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({compute_require_grad_args_str});" + outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) returns_str = f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"{indent}return returns;\n" @@ -1576,11 +1581,10 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format( grad_node_name, fill_zero_str, get_grad_in_args_str, - grad_function_prepare_str, 
inputs_autograd_meta_str, - compute_require_grad_str, inplace_check_str, - inplace_for_grad_outs_str, grad_node_name, grad_function_call_str, - check_nan_inf_str, outputs_autograd_meta_str, - next_grad_node_creation_str, returns_str) + grad_function_prepare_str, compute_require_next_grad_str, + inplace_check_str, inplace_for_grad_outs_str, grad_node_name, + grad_function_call_str, check_nan_inf_str, + outputs_autograd_meta_str, next_grad_node_creation_str, returns_str) def run(self): super().run() @@ -1631,6 +1635,7 @@ def GetBackwardAPIContents(self, forward_api_contents): if 'backward' not in forward_api_contents.keys(): return None backward_api_name = forward_api_contents['backward'] + if backward_api_name in black_ops_list: return None assert backward_api_name in grad_api_dict.keys(), AssertMessage( backward_api_name, grad_api_dict.keys()) backward_api_contents = grad_api_dict[backward_api_name] @@ -1646,7 +1651,7 @@ def GenerateCode(self): backward_api_contents = self.GetBackwardAPIContents( forward_api_contents) if backward_api_contents is None: continue - + if forward_api_contents['api'] in black_ops_list: continue # Generate Dygraph Forward Function function_generator = DygraphForwardFunctionGenerator( forward_api_contents, backward_api_contents, namespace) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 26165c59e0153..c4797029abf3c 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -52,7 +52,14 @@ class GeneralGrad { AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(inputs[i]); auto* target_node = auto_grad_meta->GetMutableGradNode().get(); - + VLOG(8) << "Get no grad vars' grad_node: " << target_node->name() + << ", " << target_node << " with output rank info: " + << auto_grad_meta->OutRankInfo().first << ", " + << auto_grad_meta->OutRankInfo().second; + if (is_no_grad_vars) { + (no_grad_var_nodes_inputmeta_map_)[target_node] = auto_grad_meta; + continue; + } if (orig_to_copied_node_mapping_.count(target_node)) { target_node = orig_to_copied_node_mapping_[target_node].get(); } else { @@ -67,11 +74,8 @@ class GeneralGrad { "stop_gradient=True.", msg, i)); - if (is_no_grad_vars) { - (no_grad_var_nodes_inputmeta_map_)[target_node] = auto_grad_meta; - } else { // normal input - (input_target_nodes_inputmeta_map_)[target_node] = auto_grad_meta; - } + // normal input + (input_target_nodes_inputmeta_map_)[target_node] = auto_grad_meta; } } } @@ -305,8 +309,6 @@ class GeneralGrad { const std::unordered_map>& node_input_buffers_dict) { - // Get no_grad_vars's GradNodes and InputMeta Info - GetTargetNodesInfo(no_grad_vars, true /* is_no_grad_vars */); // Get inputs's GradNodes and InputMeta Info GetTargetNodesInfo(inputs, false /* is_no_grad_vars */); // Purify potentialstartup_ops, remove those nodes that are the same as @@ -402,6 +404,21 @@ class GeneralGrad { std::shared_ptr orig_next_node = orig_edge.GetMutableGradNode(); + + if (no_grad_var_nodes_inputmeta_map_.count(orig_next_node.get()) && + (no_grad_var_nodes_inputmeta_map_[orig_next_node.get()] + ->OutRankInfo() == orig_edge.GetEdgeRankInfo())) { + VLOG(3) << "Get no grad edge from grad_node: " << orig_node->name() + << " : " << orig_node << " to:" << orig_next_node->name() + << ", " << orig_next_node.get() + << " with output rank info: " + << orig_edge.GetEdgeRankInfo().first << ", " + << orig_edge.GetEdgeRankInfo().second; + // Stop no grad var's preceding node + copied_node->MutableOutputMeta()[i][j].SetStopGradient(true); + copied_edge.Clear(); + 
continue; + } if (!orig_next_node) continue; // Copy Next Node @@ -638,6 +655,9 @@ std::vector RunBackward( } if (is_general_grad) { + // Get no_grad_vars's GradNodes and InputMeta Info + GeneralGrad::Instance().GetTargetNodesInfo(no_grad_vars, + true /* is_no_grad_vars */); // Copy Backward Graph GeneralGrad::Instance().ReconstructBackwardGraph(orig_queue); } @@ -696,19 +716,6 @@ std::vector RunBackward( node); } - // no_grad_vars - if (!no_grad_vars.empty() && is_general_grad) { - auto iter = - GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->find(node); - if (iter != - GeneralGrad::Instance().GetNoGradVarNodesInputMetaMap()->end()) { - VLOG(6) << "Change the input buffer[slot][rank] by Zeros"; - auto rank_info = (iter->second)->OutRankInfo(); - node_input_buffer->SetBufferSlotRankZeros(rank_info.first, - rank_info.second); - } - } - // Check input EnforceGradNodeHasInput(node); @@ -750,7 +757,8 @@ std::vector RunBackward( // Since we make edge has as same rank as bwd outputs, we indexing them // with the same rank(i, j) auto next_node_shared = edge.GetMutableGradNode(); - VLOG(3) << "Found pending node: " << next_node_shared->name(); + VLOG(3) << "Found pending node: " << next_node_shared->name() << ": " + << next_node_shared.get(); // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs @@ -800,6 +808,8 @@ std::vector RunBackward( // Update queue node_in_degree_map[next_node]--; + VLOG(6) << next_node->name() + << " ref_cnt is: " << node_in_degree_map[next_node]; PADDLE_ENFORCE( node_in_degree_map[next_node] >= 0, diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 269753f3c04f9..2f8ca2bb42095 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -106,6 +106,12 @@ class Edge { } } + void Clear() { + grad_node_.reset(); + in_slot_id_ = 0; + in_rank_ = 0; + } + private: size_t in_slot_id_; size_t in_rank_; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index c8d8b9ab548c0..231d81b5e73a6 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -24,6 +24,7 @@ namespace egr { void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) { + // Set not grad var to zero and set stop gradient as default value: true buffer_[slot_id][rank] = paddle::experimental::zeros_like(buffer_[slot_id][rank]); } @@ -59,8 +60,15 @@ void GradTensorHolder::CopyValueFromTensor( if ((!buffer_tensor.defined() || !buffer_tensor.initialized())) { // Perform deep copy here buffer_tensor.copy_(t, t.place(), false); - buffer_tensor.set_autograd_meta(t.mutable_autograd_meta()); - + auto* meta = egr::EagerUtils::autograd_meta(&buffer_tensor); + auto* origin_meta = egr::EagerUtils::nullable_autograd_meta(t); + if (origin_meta) { + auto grad_node = origin_meta->GetMutableGradNode(); + if (grad_node && grad_node.get()) { + meta->SetGradNode(origin_meta->GetMutableGradNode()); + } + meta->WeakGrad() = origin_meta->WeakGrad(); + } } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Cannot copy grad_tensors' value to grad tensor holders," @@ -81,10 +89,10 @@ void GradTensorHolder::CopyValueFromTensor( "Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR " "now.")); } - egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank])) - ->SetStopGradient(false); } } + egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank])) + 
->SetStopGradient(false); } void GradTensorHolder::add(size_t slot_id, diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 66c13c66de9fc..a6fd57ac6a4bc 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -28,6 +28,7 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/phi/api/lib/utils/allocator.h" namespace egr { class TensorWrapper { @@ -57,9 +58,12 @@ class TensorWrapper { // Only Copy Meta phi::DenseTensor* dense_tensor = static_cast(tensor.impl().get()); - auto tw_dense_tensor = std::make_shared(); - tw_dense_tensor->set_meta(dense_tensor->meta()); - intermidiate_tensor_.set_impl(tw_dense_tensor); + // TODO(jiabin): It's not a good idea to set memory size to zero, find + // another way and change this. + intermidiate_tensor_.set_impl( + std::move(std::make_shared( + std::make_shared(nullptr, 0, tensor.place()), + std::move(dense_tensor->meta())))); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Unrecognized tensor type for no_need_buffer feature")); diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 7c0243caf6abf..f445632de8c5d 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -98,6 +98,7 @@ static void GetGraphInfoBetweenTargets( auto &grad_node = output_target->GradVarBase()->GradNode(); if (visited.count(grad_node.get()) == 0) { for (auto &op : *grad_node) { + VLOG(10) << "Pushed op: " << op.Type(); q.emplace(&op, grad_node.get()); } } @@ -141,6 +142,8 @@ static void GetGraphInfoBetweenTargets( for (auto &pending_node : node->GradPendingNodes()) { for (auto &pending_op : *pending_node) { preceding_ops[&pending_op].insert(op); + VLOG(10) << "Find preceding op of: " << pending_op.Type() + << " is: " << op->Type(); } if (visited.count(pending_node.get()) == 0) { visited.insert(pending_node.get()); @@ -175,6 +178,7 @@ static void GetGraphInfoBetweenTargets( std::queue> op_queue; std::unordered_set, HashPair> op_base_visited; for (auto &endpoint_op : endpoint_ops) { + VLOG(10) << "Emplaced endpoint op: " << endpoint_op->Type(); op_queue.emplace(endpoint_op, nullptr); op_base_visited.emplace(endpoint_op, nullptr); } @@ -186,14 +190,18 @@ static void GetGraphInfoBetweenTargets( op_queue.pop(); + VLOG(10) << "Get op: " << op->Type(); + bool is_valid = false; for (auto &output_pair : op->GetOutsMap()) { if (!output_pair.second.IsGrad()) { + VLOG(10) << "Continueded output for : " << op->Type(); continue; } for (auto &out_var : output_pair.second) { if (out_var && target_vars.count(out_var.get()) > 0) { + VLOG(10) << "Find target output for : " << op->Type(); is_valid = true; break; } @@ -211,11 +219,13 @@ static void GetGraphInfoBetweenTargets( is_valid = false; for (auto &input_pair : op->GetInsMap()) { if (!input_pair.second.IsGrad()) { + VLOG(10) << "Continueded input for : " << op->Type(); continue; } for (auto &in_var : input_pair.second) { if (in_var && no_grad_var_grad.count(in_var.get()) == 0) { + VLOG(10) << "Find not no grad var in input for : " << op->Type(); target_vars.insert(in_var.get()); is_valid = true; } @@ -240,7 +250,10 @@ static void GetGraphInfoBetweenTargets( auto iter = preceding_ops.find(op); if (iter != preceding_ops.end()) { for (auto &preceding_op : iter->second) { + VLOG(10) << "Scan preceding op: " << preceding_op->Type() << " for " + << 
op->Type(); if (op_base_visited.count(std::make_pair(preceding_op, op)) == 0) { + VLOG(10) << "Emplace op: " << preceding_op->Type(); op_queue.emplace(preceding_op, op); op_base_visited.emplace(preceding_op, op); } @@ -648,6 +661,7 @@ PartialGradTask::PartialGradTask( platform::errors::Unimplemented( "only_inputs=False is not supported yet")); + VLOG(10) << "no_grad_vars size: " << no_grad_vars.size(); for (auto &var : no_grad_vars) { if (var && var->GradVarBase()) { no_grad_var_grad_.insert(var->GradVarBase()->SharedVar().get()); @@ -853,6 +867,7 @@ std::vector> PartialGradTask::Run() { } for (auto &pending_op : iter->second) { + VLOG(10) << "Find pending op" << pending_op->Type(); auto dep_iter = op_deps_.find(pending_op); PADDLE_ENFORCE_EQ( dep_iter != op_deps_.end(), @@ -862,6 +877,7 @@ std::vector> PartialGradTask::Run() { if (--(dep_iter->second) == 0) { q.push(pending_op); } + VLOG(10) << "Pending op deps: " << dep_iter->second; } } diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 7ed4346ed82c2..c0ff0914401b5 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -82,7 +82,7 @@ std::shared_ptr> PrepareData( auto& template_var = name_pair.second[i]; SetForwardDataTypeOfGradVar(template_var); const auto* tensor = GetTensorFromVar(template_var->Var()); - if (tensor && tensor->IsInitialized()) { + if (tensor && tensor->IsInitialized() && (tensor->memory_size() != 0)) { auto kernel_type_for_var = op.GetKernelTypeForVar( name_pair.first, *tensor, expected_kernel_key); if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { @@ -91,7 +91,8 @@ std::shared_ptr> PrepareData( VLOG(3) << "Transform Variable " << GetNameFromVar(template_var) << " from " << kernel_type_for_var << " to " << expected_kernel_key; - + VLOG(3) << GetNameFromVar(template_var) + << " memory size is: " << tensor->memory_size(); if (CheckCachedKey(template_var, expected_kernel_key)) { VLOG(3) << "Hit variable_wrapper cache: key=" << expected_kernel_key; @@ -634,7 +635,8 @@ void PreparePhiData(const phi::Kernel& phi_kernel, for (size_t offset = 0; offset < ins_vector.size(); ++offset) { auto& var = ins_vector[offset]; const auto* tensor_in = GetTensorFromVar(var->Var()); - if (tensor_in && tensor_in->IsInitialized()) { + if (tensor_in && tensor_in->IsInitialized() && + (tensor_in->memory_size() != 0)) { if (in_def.backend == phi::Backend::ALL_BACKEND) { continue; } diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 7f54f472bdcd5..f436d0e96b5dc 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -98,10 +98,11 @@ void EmptyTensorInitializer(TensorObject* self, } if (!autograd_meta->GetMutableGradNode()) { - VLOG(3) << "Tensor(" << name - << ") have not GradNode, add GradNodeAccumulation for it."; autograd_meta->SetGradNode( std::make_shared(autograd_meta)); + VLOG(3) << "Tensor(" << name + << ") have not GradNode, add GradNodeAccumulation" + << autograd_meta->GradNode() << " for it."; } } diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index a7f11fc963ebe..12e262b3f7cb5 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -95,6 +95,7 @@ PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_TRY VLOG(6) << "Get grad for tensor: " << self->tensor.name(); auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); + VLOG(6) << 
meta << " initialized: " << meta->Grad().initialized(); if (meta && meta->Grad().initialized()) { return ToPyObject(meta->Grad()); } else { diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index f50323cef216c..5a5aa9638a3be 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -257,8 +257,8 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp} ${bw_api_source_file} COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}" - DEPENDS ${bw_api_yaml_file} ${legacy_bw_api_yaml_file} ${bw_api_gen_file} - ${api_gen_base} + DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file} ${api_gen_base} + ${legacy_bw_api_yaml_file} VERBATIM) # generate sparse api diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index a4589120cc475..4af32c7e4cfa0 100644 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -133,6 +133,17 @@ func : asinh_grad inplace : (out_grad -> x_grad) +- backward_api : assign_double_grad + forward : assign_grad (Tensor grad_out) -> Tensor(grad_x) + args : (Tensor grad_x_grad) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + kernel : + func : assign + backward: assign_triple_grad + inplace : (grad_x_grad -> grad_out_grad) + - backward_api : assign_grad forward : assign (Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -141,6 +152,7 @@ func : UnchangedInferMeta kernel : func : assign + backward: assign_double_grad inplace : (out_grad -> x_grad) - backward_api : assign_out__grad @@ -153,6 +165,16 @@ func : assign inplace : (out_grad -> x_grad) +- backward_api : assign_triple_grad + forward : assign_double_grad (Tensor grad_out) -> Tensor(grad_x) + args : (Tensor grad_x_grad) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + kernel : + func : assign + inplace : (grad_x_grad -> grad_out_grad) + - backward_api : atan_grad forward : atan (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) @@ -1823,6 +1845,16 @@ func : sinh_grad inplace : (out_grad -> x_grad) +- backward_api : slice_double_grad + forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input) + args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [grad_input_grad] + kernel : + func : slice + - backward_api : slice_grad forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out) args : (Tensor input, Tensor out_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) @@ -1832,6 +1864,7 @@ param : [input] kernel : func : slice_grad + backward : slice_double_grad no_need_buffer : input - backward_api : soft_shrink_grad From 9aaae254d5d4c46825d1627edff90c8e5bf9ee96 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 7 Jul 2022 12:38:14 +0800 Subject: [PATCH 089/250] Fix dev ctx with cuda graph (#44109) --- paddle/fluid/platform/CMakeLists.txt | 4 ++ .../platform/cuda_graph_with_memory_pool.cc | 12 +++- .../device_context_test_cuda_graph.cu | 39 +++++++++++++ paddle/phi/core/device_context.cc | 57 +++++++++++++++++++ paddle/phi/core/device_context.h | 27 +++++++++ 
.../fluid/tests/unittests/test_cuda_graph.py | 10 ++++ 6 files changed, 147 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/platform/device_context_test_cuda_graph.cu diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index dc6911aecf130..efe0479871215 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -297,6 +297,10 @@ if(WITH_GPU) device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) + nv_test( + device_context_test_cuda_graph + SRCS device_context_test_cuda_graph.cu + DEPS device_context gpu_info cuda_graph_with_memory_pool) nv_test( transform_test SRCS transform_test.cu diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index eb9f1ca845a28..bfdf492962de3 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -26,7 +26,9 @@ namespace platform { void BeginCUDAGraphCapture(platform::CUDAPlace place, cudaStreamCaptureMode mode, int64_t pool_id) { - auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto* dev_ctx = + reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); // After PR(#43206), cudnn related initializations will change to lazy mode. @@ -49,6 +51,9 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, pool_id = CUDAGraph::SetMemoryPoolID(pool_id); memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph( pool_id); + dev_ctx->SetCUDAGraphAllocator(memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place) + .get()); if (old_value) { FLAGS_use_stream_safe_cuda_allocator = true; } @@ -60,8 +65,11 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, std::unique_ptr EndCUDAGraphCapture() { auto place = CUDAGraph::CapturingPlace(); - auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + auto* mutable_dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto* dev_ctx = + reinterpret_cast(mutable_dev_ctx); dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + dev_ctx->SetCUDAGraphAllocator(nullptr); return CUDAGraph::EndCapture(); } #endif diff --git a/paddle/fluid/platform/device_context_test_cuda_graph.cu b/paddle/fluid/platform/device_context_test_cuda_graph.cu new file mode 100644 index 0000000000000..9f5a551743ed1 --- /dev/null +++ b/paddle/fluid/platform/device_context_test_cuda_graph.cu @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "cuda.h" // NOLINT +#include "cuda_runtime.h" // NOLINT +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/device_context.h" + +TEST(Device, DeviceContextWithCUDAGraph) { + using paddle::platform::CUDADeviceContext; + using paddle::platform::CUDAPlace; + using paddle::platform::DeviceContext; + using paddle::platform::DeviceContextPool; + using paddle::platform::Place; + + DeviceContextPool& pool = DeviceContextPool::Instance(); + Place place = CUDAPlace(0); + auto* dev_ctx = pool.Get(place); + + paddle::platform::BeginCUDAGraphCapture( + place, cudaStreamCaptureMode::cudaStreamCaptureModeThreadLocal, 0); + ASSERT_EQ(dev_ctx->IsCUDAGraphAllocatorValid(), true); + dev_ctx->GetCUDAGraphAllocator(); + paddle::platform::EndCUDAGraphCapture(); + ASSERT_EQ(dev_ctx->IsCUDAGraphAllocatorValid(), false); +} diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index ce57f4f627baa..fc85fc32f62a8 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -14,6 +14,10 @@ #include "paddle/phi/core/device_context.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#endif + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" @@ -58,6 +62,26 @@ struct DeviceContext::Impl { pinned_allocator_ = allocator; } +#ifdef PADDLE_WITH_CUDA + void SetCUDAGraphAllocator(const Allocator* allocator) { + // NOTE (Yuang): cuda graph allocator can be set to nullptr, so don't check + // validation of the allocator here + cuda_graph_allocator_ = allocator; + } + + const Allocator& GetCUDAGraphAllocator() const { + PADDLE_ENFORCE_NOT_NULL(cuda_graph_allocator_, + phi::errors::InvalidArgument( + "Required cuda_graph_allocator_ shall not be " + "nullptr, but received nullptr.")); + return *cuda_graph_allocator_; + } + + bool IsCUDAGraphAllocatorValid() const { + return cuda_graph_allocator_ != nullptr; + } +#endif + const Allocator& GetAllocator() const { PADDLE_ENFORCE_NOT_NULL( device_allocator_, @@ -111,6 +135,17 @@ struct DeviceContext::Impl { auto* allocator = tensor->numel() == 0 ? zero_allocator_ : (pinned ? 
pinned_allocator_ : device_allocator_); +#ifdef PADDLE_WITH_CUDA + bool must_cuda_graph_allocator = (tensor->numel() != 0) && !pinned; + if (must_cuda_graph_allocator && paddle::platform::is_gpu_place(place) && + paddle::platform::CUDAGraph::IsThisThreadCapturing()) { + PADDLE_ENFORCE_NOT_NULL(cuda_graph_allocator_, + phi::errors::InvalidArgument( + "Required cuda_graph_allocator_ shall not be " + "nullptr, but received nullptr.")); + allocator = cuda_graph_allocator_; + } +#endif return tensor->AllocateFrom( const_cast(allocator), dtype, requested_size); } @@ -200,6 +235,9 @@ struct DeviceContext::Impl { const Allocator* host_allocator_{nullptr}; const Allocator* zero_allocator_{nullptr}; const Allocator* pinned_allocator_{nullptr}; +#ifdef PADDLE_WITH_CUDA + const Allocator* cuda_graph_allocator_{nullptr}; +#endif Generator* device_generator_{nullptr}; Generator* host_generator_{nullptr}; }; @@ -213,6 +251,11 @@ DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetPinnedAllocator(&other.GetPinnedAllocator()); impl_->SetHostGenerator(other.GetHostGenerator()); impl_->SetGenerator(other.GetGenerator()); +#ifdef PADDLE_WITH_CUDA + if (other.IsCUDAGraphAllocatorValid()) { + impl_->SetCUDAGraphAllocator(&other.GetCUDAGraphAllocator()); + } +#endif } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -239,6 +282,20 @@ const Allocator& DeviceContext::GetHostAllocator() const { return impl_->GetHostAllocator(); } +#ifdef PADDLE_WITH_CUDA +void DeviceContext::SetCUDAGraphAllocator(const Allocator* allocator) { + impl_->SetCUDAGraphAllocator(allocator); +} + +const Allocator& DeviceContext::GetCUDAGraphAllocator() const { + return impl_->GetCUDAGraphAllocator(); +} + +bool DeviceContext::IsCUDAGraphAllocatorValid() const { + return impl_->IsCUDAGraphAllocatorValid(); +} +#endif + void DeviceContext::SetZeroAllocator(const Allocator* allocator) { impl_->SetZeroAllocator(allocator); } diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index 45e4fbf64dc04..32dbb0c0a357c 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -106,6 +106,33 @@ class PADDLE_API DeviceContext { const Allocator& GetPinnedAllocator() const; +#ifdef PADDLE_WITH_CUDA + /** + * @brief Set the CUDA graph Allocator object. + * + * @param allocator + */ + void SetCUDAGraphAllocator(const Allocator*); + + /** + * @brief Get the const CUDA graph Allocator object. + * + * @return Allocator + */ + const Allocator& GetCUDAGraphAllocator() const; + + /** + * @brief Test whether the CUDA graph allocator is valid + * + * This method should be called before calling GetCUDAGraphAllocator(). + * Other unit can calls GetCUDAGraphAllocator() method, + * only when this method returns True! + * + * @return true if cuda_graph_allocator_ is valid, false otherwise + */ + bool IsCUDAGraphAllocatorValid() const; +#endif + /** * @brief Allocate device memory for tensor. 
*/ diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py index fda3fa79ef664..446a5500bc30b 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -236,6 +236,16 @@ def __getitem__(self, idx): self.assertTrue(np.array_equal(actual_x, x.numpy())) self.assertTrue(np.array_equal(actual_y, y.numpy())) + def test_dev_ctx_alloc(self): + if not can_use_cuda_graph(): + return + + x = paddle.to_tensor([2], dtype='float32') + graph = CUDAGraph() + graph.capture_begin() + y = paddle.cast(x, dtype='float16') + graph.capture_end() + if __name__ == "__main__": unittest.main() From 2afa9b7652c924a589244a946a109f1d4651f343 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 7 Jul 2022 12:54:19 +0800 Subject: [PATCH 090/250] [Eager] Menual fused attention in eager (#43974) * fused_gate_attention manual code in eager --- .../manual/fluid_manual/dygraph_forward_api.h | 34 + .../fluid_manual/forwards/CMakeLists.txt | 8 + .../forwards/fused_attention_fwd_func.cc | 628 ++++++++++++++++++ .../manual/fluid_manual/nodes/CMakeLists.txt | 7 +- .../nodes/fused_attention_node.cc | 366 ++++++++++ .../api/manual/fluid_manual/nodes/nodes.h | 202 ++++++ .../auto_code_generator/eager_generator.cc | 6 +- .../unittests/test_fused_attention_op.py | 4 +- .../unittests/test_fused_gate_attention_op.py | 4 +- 9 files changed, 1250 insertions(+), 9 deletions(-) create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc create mode 100644 paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc diff --git a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h index 397e549e61473..91d556f9557dc 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h @@ -67,3 +67,37 @@ fused_feedforward_dygraph_function( const paddle::experimental::Tensor& Ln2Scale, const paddle::experimental::Tensor& Ln2Bias, const paddle::framework::AttributeMap& attr_map); + +std::tuple +fused_attention_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& LnScale, + const paddle::experimental::Tensor& LnBias, + const paddle::experimental::Tensor& QKVW, + const paddle::experimental::Tensor& QKVBias, + const paddle::experimental::Tensor& CacheKV, + const paddle::experimental::Tensor& SrcMask, + const paddle::experimental::Tensor& OutLinearW, + const paddle::experimental::Tensor& OutLinearBias, + const paddle::experimental::Tensor& Ln2Scale, + const paddle::experimental::Tensor& Ln2Bias, + const paddle::framework::AttributeMap& attr_map); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt index 305df1c92c6e1..4912663ef1f54 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt @@ -12,6 +12,14 @@ cc_library( add_dependencies(fused_feedforward_fwd_func eager_codegen) +cc_library( + fused_attention_fwd_func + SRCS fused_attention_fwd_func.cc + DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + +add_dependencies(fused_attention_fwd_func eager_codegen) + set(fluid_manual_functions fused_gate_attention_fwd_func fused_feedforward_fwd_func + 
fused_attention_fwd_func PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc new file mode 100644 index 0000000000000..b058fa50acdd9 --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc @@ -0,0 +1,628 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/amp_auto_cast.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" + +std::tuple +fused_attention_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& LnScale, + const paddle::experimental::Tensor& LnBias, + const paddle::experimental::Tensor& QKVW, + const paddle::experimental::Tensor& QKVBias, + const paddle::experimental::Tensor& CacheKV, + const paddle::experimental::Tensor& SrcMask, + const paddle::experimental::Tensor& OutLinearW, + const paddle::experimental::Tensor& OutLinearBias, + const paddle::experimental::Tensor& Ln2Scale, + const paddle::experimental::Tensor& Ln2Bias, + const paddle::framework::AttributeMap& attr_map) { + paddle::platform::RecordEvent dygraph_entrance_record_event( + "fused_attention dygraph", + paddle::platform::TracerEventType::Operator, + 1); + VLOG(3) << "Running Eager Forward Op: fused_attention"; + // Dygraph Forward Pass + + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {{X}, {QKVW}, {OutLinearW}}; + if (LnScale.initialized()) amp_tensors_vector.push_back({LnScale}); + if (LnBias.initialized()) amp_tensors_vector.push_back({LnBias}); + if (QKVBias.initialized()) amp_tensors_vector.push_back({QKVBias}); + if (CacheKV.initialized()) amp_tensors_vector.push_back({CacheKV}); + if (SrcMask.initialized()) amp_tensors_vector.push_back({SrcMask}); + if (OutLinearBias.initialized()) + amp_tensors_vector.push_back({OutLinearBias}); + if (Ln2Scale.initialized()) amp_tensors_vector.push_back({Ln2Scale}); + if (Ln2Bias.initialized()) amp_tensors_vector.push_back({Ln2Bias}); + + auto amp_dst_dtype = + egr::GetAmpDestDtype("fused_attention", amp_tensors_vector); + + auto NEW_X = egr::AmpAutoCast("X", X, amp_dst_dtype, "fused_attention"); + auto NEW_QKVW = + egr::AmpAutoCast("QKVW", QKVW, amp_dst_dtype, "fused_attention"); + auto NEW_OutLinearW = egr::AmpAutoCast( + "OutLinearW", OutLinearW, amp_dst_dtype, "fused_attention"); + auto NEW_LnScale = + 
((LnScale.initialized()) + ? egr::AmpAutoCast( + "LnScale", LnScale, amp_dst_dtype, "fused_attention") + : LnScale); + auto NEW_LnBias = + ((LnBias.initialized()) + ? egr::AmpAutoCast( + "LnBias", LnBias, amp_dst_dtype, "fused_attention") + : LnBias); + auto NEW_QKVBias = + ((QKVBias.initialized()) + ? egr::AmpAutoCast( + "QKVBias", QKVBias, amp_dst_dtype, "fused_attention") + : QKVBias); + auto NEW_CacheKV = + ((CacheKV.initialized()) + ? egr::AmpAutoCast( + "CacheKV", CacheKV, amp_dst_dtype, "fused_attention") + : CacheKV); + auto NEW_SrcMask = + ((SrcMask.initialized()) + ? egr::AmpAutoCast( + "SrcMask", SrcMask, amp_dst_dtype, "fused_attention") + : SrcMask); + auto NEW_OutLinearBias = + ((OutLinearBias.initialized()) ? egr::AmpAutoCast("OutLinearBias", + OutLinearBias, + amp_dst_dtype, + "fused_attention") + : OutLinearBias); + auto NEW_Ln2Scale = + ((Ln2Scale.initialized()) + ? egr::AmpAutoCast( + "Ln2Scale", Ln2Scale, amp_dst_dtype, "fused_attention") + : Ln2Scale); + auto NEW_Ln2Bias = + ((Ln2Bias.initialized()) + ? egr::AmpAutoCast( + "Ln2Bias", Ln2Bias, amp_dst_dtype, "fused_attention") + : Ln2Bias); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return fused_attention_dygraph_function(NEW_X, + NEW_LnScale, + NEW_LnBias, + NEW_QKVW, + NEW_QKVBias, + NEW_CacheKV, + NEW_SrcMask, + NEW_OutLinearW, + NEW_OutLinearBias, + NEW_Ln2Scale, + NEW_Ln2Bias, + attr_map); + } + } + + std::map>> ins = + {{"X", egr::EagerUtils::TrySyncToVars(X)}, + {"QKVW", egr::EagerUtils::TrySyncToVars(QKVW)}, + {"OutLinearW", egr::EagerUtils::TrySyncToVars(OutLinearW)}}; + if (LnScale.initialized()) + ins["LnScale"] = egr::EagerUtils::TrySyncToVars(LnScale); + if (LnBias.initialized()) + ins["LnBias"] = egr::EagerUtils::TrySyncToVars(LnBias); + if (QKVBias.initialized()) + ins["QKVBias"] = egr::EagerUtils::TrySyncToVars(QKVBias); + if (CacheKV.initialized()) + ins["CacheKV"] = egr::EagerUtils::TrySyncToVars(CacheKV); + if (SrcMask.initialized()) + ins["SrcMask"] = egr::EagerUtils::TrySyncToVars(SrcMask); + if (OutLinearBias.initialized()) + ins["OutLinearBias"] = egr::EagerUtils::TrySyncToVars(OutLinearBias); + if (Ln2Scale.initialized()) + ins["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale); + if (Ln2Bias.initialized()) + ins["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias); + + std::map>> outs = + {{"LnMean", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"LnVariance", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"LnOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKVOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKVBiasOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"TransposeOut2", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"QKTVOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"SoftmaxOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"AttnDropoutMaskOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"AttnDropoutOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"SrcMaskOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"FMHAOut", + 
{std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"OutLinearOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"DropoutMaskOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln2Mean", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Ln2Variance", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"BiasDropoutResidualOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"CacheKVOut", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}, + {"Y", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}}; + + // Prepare Autograd Meta + egr::AutogradMeta* p_autograd_X = egr::EagerUtils::nullable_autograd_meta(X); + egr::AutogradMeta* p_autograd_LnScale = + egr::EagerUtils::nullable_autograd_meta(LnScale); + egr::AutogradMeta* p_autograd_LnBias = + egr::EagerUtils::nullable_autograd_meta(LnBias); + egr::AutogradMeta* p_autograd_QKVW = + egr::EagerUtils::nullable_autograd_meta(QKVW); + egr::AutogradMeta* p_autograd_QKVBias = + egr::EagerUtils::nullable_autograd_meta(QKVBias); + egr::AutogradMeta* p_autograd_CacheKV = + egr::EagerUtils::nullable_autograd_meta(CacheKV); + egr::AutogradMeta* p_autograd_SrcMask = + egr::EagerUtils::nullable_autograd_meta(SrcMask); + egr::AutogradMeta* p_autograd_OutLinearW = + egr::EagerUtils::nullable_autograd_meta(OutLinearW); + egr::AutogradMeta* p_autograd_OutLinearBias = + egr::EagerUtils::nullable_autograd_meta(OutLinearBias); + egr::AutogradMeta* p_autograd_Ln2Scale = + egr::EagerUtils::nullable_autograd_meta(Ln2Scale); + egr::AutogradMeta* p_autograd_Ln2Bias = + egr::EagerUtils::nullable_autograd_meta(Ln2Bias); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + + bool require_any_grad = + egr::EagerUtils::ComputeRequireGrad(trace_backward, + p_autograd_X, + p_autograd_LnScale, + p_autograd_LnBias, + p_autograd_QKVW, + p_autograd_QKVBias, + p_autograd_CacheKV, + p_autograd_SrcMask, + p_autograd_OutLinearW, + p_autograd_OutLinearBias, + p_autograd_Ln2Scale, + p_autograd_Ln2Bias); + + paddle::framework::AttributeMap attrs = attr_map; + paddle::framework::AttributeMap default_attrs; + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_attention", + ins, + outs, + attrs, + egr::Controller::Instance().GetExpectedPlace(), + &default_attrs, + true, + {}); + + paddle::experimental::Tensor LnMean; + egr::EagerUtils::GetOutput(outs["LnMean"][0], &LnMean); + paddle::experimental::Tensor LnVariance; + egr::EagerUtils::GetOutput(outs["LnVariance"][0], &LnVariance); + paddle::experimental::Tensor LnOut; + egr::EagerUtils::GetOutput(outs["LnOut"][0], &LnOut); + paddle::experimental::Tensor QKVOut; + egr::EagerUtils::GetOutput(outs["QKVOut"][0], &QKVOut); + paddle::experimental::Tensor QKVBiasOut; + egr::EagerUtils::GetOutput(outs["QKVBiasOut"][0], &QKVBiasOut); + paddle::experimental::Tensor TransposeOut2; + egr::EagerUtils::GetOutput(outs["TransposeOut2"][0], &TransposeOut2); + paddle::experimental::Tensor QKOut; + egr::EagerUtils::GetOutput(outs["QKOut"][0], &QKOut); + paddle::experimental::Tensor QKTVOut; + egr::EagerUtils::GetOutput(outs["QKTVOut"][0], &QKTVOut); + paddle::experimental::Tensor SoftmaxOut; + egr::EagerUtils::GetOutput(outs["SoftmaxOut"][0], &SoftmaxOut); + paddle::experimental::Tensor AttnDropoutMaskOut; + egr::EagerUtils::GetOutput(outs["AttnDropoutMaskOut"][0], + &AttnDropoutMaskOut); + 
paddle::experimental::Tensor AttnDropoutOut; + egr::EagerUtils::GetOutput(outs["AttnDropoutOut"][0], &AttnDropoutOut); + paddle::experimental::Tensor SrcMaskOut; + egr::EagerUtils::GetOutput(outs["SrcMaskOut"][0], &SrcMaskOut); + paddle::experimental::Tensor FMHAOut; + egr::EagerUtils::GetOutput(outs["FMHAOut"][0], &FMHAOut); + paddle::experimental::Tensor OutLinearOut; + egr::EagerUtils::GetOutput(outs["OutLinearOut"][0], &OutLinearOut); + paddle::experimental::Tensor DropoutMaskOut; + egr::EagerUtils::GetOutput(outs["DropoutMaskOut"][0], &DropoutMaskOut); + paddle::experimental::Tensor Ln2Mean; + egr::EagerUtils::GetOutput(outs["Ln2Mean"][0], &Ln2Mean); + paddle::experimental::Tensor Ln2Variance; + egr::EagerUtils::GetOutput(outs["Ln2Variance"][0], &Ln2Variance); + paddle::experimental::Tensor BiasDropoutResidualOut; + egr::EagerUtils::GetOutput(outs["BiasDropoutResidualOut"][0], + &BiasDropoutResidualOut); + paddle::experimental::Tensor CacheKVOut; + egr::EagerUtils::GetOutput(outs["CacheKVOut"][0], &CacheKVOut); + paddle::experimental::Tensor Y; + egr::EagerUtils::GetOutput(outs["Y"][0], &Y); + + { + paddle::platform::RecordEvent node_creation_record_event( + "fused_attention node_creation", + paddle::platform::TracerEventType::Operator, + 1); + egr::AutogradMeta* p_autograd_LnMean = + egr::EagerUtils::autograd_meta(&LnMean); + egr::AutogradMeta* p_autograd_LnVariance = + egr::EagerUtils::autograd_meta(&LnVariance); + egr::AutogradMeta* p_autograd_LnOut = + egr::EagerUtils::autograd_meta(&LnOut); + egr::AutogradMeta* p_autograd_QKVOut = + egr::EagerUtils::autograd_meta(&QKVOut); + egr::AutogradMeta* p_autograd_QKVBiasOut = + egr::EagerUtils::autograd_meta(&QKVBiasOut); + egr::AutogradMeta* p_autograd_TransposeOut2 = + egr::EagerUtils::autograd_meta(&TransposeOut2); + egr::AutogradMeta* p_autograd_QKOut = + egr::EagerUtils::autograd_meta(&QKOut); + egr::AutogradMeta* p_autograd_QKTVOut = + egr::EagerUtils::autograd_meta(&QKTVOut); + egr::AutogradMeta* p_autograd_SoftmaxOut = + egr::EagerUtils::autograd_meta(&SoftmaxOut); + egr::AutogradMeta* p_autograd_AttnDropoutMaskOut = + egr::EagerUtils::autograd_meta(&AttnDropoutMaskOut); + egr::AutogradMeta* p_autograd_AttnDropoutOut = + egr::EagerUtils::autograd_meta(&AttnDropoutOut); + egr::AutogradMeta* p_autograd_SrcMaskOut = + egr::EagerUtils::autograd_meta(&SrcMaskOut); + egr::AutogradMeta* p_autograd_FMHAOut = + egr::EagerUtils::autograd_meta(&FMHAOut); + egr::AutogradMeta* p_autograd_OutLinearOut = + egr::EagerUtils::autograd_meta(&OutLinearOut); + egr::AutogradMeta* p_autograd_DropoutMaskOut = + egr::EagerUtils::autograd_meta(&DropoutMaskOut); + egr::AutogradMeta* p_autograd_Ln2Mean = + egr::EagerUtils::autograd_meta(&Ln2Mean); + egr::AutogradMeta* p_autograd_Ln2Variance = + egr::EagerUtils::autograd_meta(&Ln2Variance); + egr::AutogradMeta* p_autograd_BiasDropoutResidualOut = + egr::EagerUtils::autograd_meta(&BiasDropoutResidualOut); + egr::AutogradMeta* p_autograd_CacheKVOut = + egr::EagerUtils::autograd_meta(&CacheKVOut); + egr::AutogradMeta* p_autograd_Y = egr::EagerUtils::autograd_meta(&Y); + if (require_any_grad) { + VLOG(6) << " Construct Grad for fused_attention "; + egr::EagerUtils::PassStopGradient(false, + p_autograd_LnMean, + p_autograd_LnVariance, + p_autograd_LnOut, + p_autograd_QKVOut, + p_autograd_QKVBiasOut, + p_autograd_TransposeOut2, + p_autograd_QKOut, + p_autograd_QKTVOut, + p_autograd_SoftmaxOut, + p_autograd_AttnDropoutMaskOut, + p_autograd_AttnDropoutOut, + p_autograd_SrcMaskOut, + p_autograd_FMHAOut, + 
p_autograd_OutLinearOut, + p_autograd_DropoutMaskOut, + p_autograd_Ln2Mean, + p_autograd_Ln2Variance, + p_autograd_BiasDropoutResidualOut, + p_autograd_CacheKVOut, + p_autograd_Y); + // Create GradOpNode + auto grad_node = std::shared_ptr( + new fused_attentionGradNodeCompat(20, 23)); + + bool pre_layer_norm = false; + if (attrs.count("pre_layer_norm")) { + pre_layer_norm = BOOST_GET_CONST(bool, attrs.at("pre_layer_norm")); + } + + // Set Attributes + grad_node->SetAttrMap(std::move(attrs)); + grad_node->SetDefaultAttrMap(std::move(default_attrs)); + + grad_node->SetTensorWrapperX(X); + grad_node->SetTensorWrapperQKVW(QKVW); + grad_node->SetTensorWrapperOutLinearW(OutLinearW); + grad_node->SetTensorWrapperQKVOut(QKVOut); + grad_node->SetTensorWrapperTransposeOut2(TransposeOut2); + grad_node->SetTensorWrapperQKOut(QKOut); + grad_node->SetTensorWrapperQKTVOut(QKTVOut); + grad_node->SetTensorWrapperSoftmaxOut(SoftmaxOut); + grad_node->SetTensorWrapperAttnDropoutMaskOut(AttnDropoutMaskOut); + grad_node->SetTensorWrapperAttnDropoutOut(AttnDropoutOut); + grad_node->SetTensorWrapperFMHAOut(FMHAOut); + grad_node->SetTensorWrapperOutLinearOut(OutLinearOut); + grad_node->SetTensorWrapperDropoutMaskOut(DropoutMaskOut); + + grad_node->SetGradOutMeta(X, 0); + grad_node->SetGradOutMeta(QKVW, 3); + grad_node->SetGradOutMeta(OutLinearW, 7); + + if (QKVBias.initialized()) { + grad_node->SetTensorWrapperQKVBias(QKVBias); + grad_node->SetTensorWrapperQKVBiasOut(QKVBiasOut); + grad_node->SetGradOutMeta(QKVBias, 4); + + auto QKVBiasOut_accumulation_node = + std::make_shared(p_autograd_QKVBiasOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKVBiasOut, 0); + egr::EagerUtils::SetHistory(p_autograd_QKVBiasOut, + QKVBiasOut_accumulation_node); + QKVBiasOut_accumulation_node->SetGradInMeta(QKVBiasOut, 0); + egr::EagerUtils::CheckAndRetainGrad(QKVBiasOut); + grad_node->SetGradOutMeta(QKVBiasOut, 11); + } + + if (SrcMask.initialized()) { + grad_node->SetTensorWrapperSrcMask(SrcMask); + grad_node->SetTensorWrapperSrcMaskOut(SrcMaskOut); + + auto SrcMaskOut_accumulation_node = + std::make_shared(p_autograd_SrcMaskOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_SrcMaskOut, 0); + egr::EagerUtils::SetHistory(p_autograd_SrcMaskOut, + SrcMaskOut_accumulation_node); + SrcMaskOut_accumulation_node->SetGradInMeta(SrcMaskOut, 0); + egr::EagerUtils::CheckAndRetainGrad(SrcMaskOut); + grad_node->SetGradOutMeta(SrcMaskOut, 12); + } + + if (OutLinearBias.initialized()) { + grad_node->SetTensorWrapperOutLinearBias(OutLinearBias); + grad_node->SetGradOutMeta(OutLinearBias, 8); + } + + if (pre_layer_norm) { + if (LnScale.initialized()) { + grad_node->SetTensorWrapperLnScale(LnScale); + grad_node->SetGradOutMeta(LnScale, 1); + } + if (LnBias.initialized()) { + grad_node->SetTensorWrapperLnBias(LnBias); + grad_node->SetGradOutMeta(LnBias, 2); + } + if (LnOut.initialized()) { + grad_node->SetTensorWrapperLnOut(LnOut); + + auto LnOut_accumulation_node = + std::make_shared(p_autograd_LnOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_LnOut, 0); + egr::EagerUtils::SetHistory(p_autograd_LnOut, + LnOut_accumulation_node); + LnOut_accumulation_node->SetGradInMeta(LnOut, 0); + egr::EagerUtils::CheckAndRetainGrad(LnOut); + grad_node->SetGradOutMeta(LnOut, 13); + } + if (LnMean.initialized()) { + grad_node->SetTensorWrapperLnMean(LnMean); + } + if (LnVariance.initialized()) { + grad_node->SetTensorWrapperLnVariance(LnVariance); + } + } else { + if (Ln2Scale.initialized()) { + grad_node->SetTensorWrapperLn2Scale(Ln2Scale); + 
grad_node->SetGradOutMeta(Ln2Scale, 9); + } + if (Ln2Bias.initialized()) { + grad_node->SetTensorWrapperLn2Bias(Ln2Bias); + grad_node->SetGradOutMeta(Ln2Bias, 10); + } + grad_node->SetTensorWrapperBiasDropoutResidualOut( + BiasDropoutResidualOut); + grad_node->SetTensorWrapperLn2Mean(Ln2Mean); + grad_node->SetTensorWrapperLn2Variance(Ln2Variance); + + auto BiasDropoutResidualOut_accumulation_node = + std::make_shared( + p_autograd_BiasDropoutResidualOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_BiasDropoutResidualOut, + 0); + egr::EagerUtils::SetHistory(p_autograd_BiasDropoutResidualOut, + BiasDropoutResidualOut_accumulation_node); + BiasDropoutResidualOut_accumulation_node->SetGradInMeta( + BiasDropoutResidualOut, 0); + egr::EagerUtils::CheckAndRetainGrad(BiasDropoutResidualOut); + grad_node->SetGradOutMeta(BiasDropoutResidualOut, 14); + } + + egr::EagerUtils::SetOutRankWithSlot(p_autograd_LnMean, 0); + grad_node->SetGradInMeta(LnMean, 0); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_LnVariance, 1); + grad_node->SetGradInMeta(LnVariance, 1); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_AttnDropoutMaskOut, 9); + grad_node->SetGradInMeta(AttnDropoutMaskOut, 9); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_DropoutMaskOut, 14); + grad_node->SetGradInMeta(DropoutMaskOut, 14); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Mean, 15); + grad_node->SetGradInMeta(Ln2Mean, 15); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Variance, 16); + grad_node->SetGradInMeta(Ln2Variance, 16); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_CacheKVOut, 18); + egr::EagerUtils::SetHistory(p_autograd_CacheKVOut, grad_node); + grad_node->SetGradInMeta(CacheKVOut, 18); + egr::EagerUtils::CheckAndRetainGrad(CacheKVOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_Y, 19); + egr::EagerUtils::SetHistory(p_autograd_Y, grad_node); + grad_node->SetGradInMeta(Y, 19); + egr::EagerUtils::CheckAndRetainGrad(Y); + + auto QKVOut_accumulation_node = + std::make_shared(p_autograd_QKVOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKVOut, 0); + egr::EagerUtils::SetHistory(p_autograd_QKVOut, QKVOut_accumulation_node); + QKVOut_accumulation_node->SetGradInMeta(QKVOut, 0); + egr::EagerUtils::CheckAndRetainGrad(QKVOut); + grad_node->SetGradOutMeta(QKVOut, 15); + + auto QKTVOut_accumulation_node = + std::make_shared(p_autograd_QKTVOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKTVOut, 0); + egr::EagerUtils::SetHistory(p_autograd_QKTVOut, + QKTVOut_accumulation_node); + QKTVOut_accumulation_node->SetGradInMeta(QKTVOut, 0); + egr::EagerUtils::CheckAndRetainGrad(QKTVOut); + grad_node->SetGradOutMeta(QKTVOut, 16); + + auto TransposeOut2_accumulation_node = + std::make_shared(p_autograd_TransposeOut2); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_TransposeOut2, 0); + egr::EagerUtils::SetHistory(p_autograd_TransposeOut2, + TransposeOut2_accumulation_node); + TransposeOut2_accumulation_node->SetGradInMeta(TransposeOut2, 0); + egr::EagerUtils::CheckAndRetainGrad(TransposeOut2); + grad_node->SetGradOutMeta(TransposeOut2, 17); + + auto QKOut_accumulation_node = + std::make_shared(p_autograd_QKOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKOut, 0); + egr::EagerUtils::SetHistory(p_autograd_QKOut, QKOut_accumulation_node); + QKOut_accumulation_node->SetGradInMeta(QKOut, 0); + egr::EagerUtils::CheckAndRetainGrad(QKOut); + grad_node->SetGradOutMeta(QKOut, 18); + + auto SoftmaxOut_accumulation_node = + std::make_shared(p_autograd_SoftmaxOut); + 
egr::EagerUtils::SetOutRankWithSlot(p_autograd_SoftmaxOut, 0); + egr::EagerUtils::SetHistory(p_autograd_SoftmaxOut, + SoftmaxOut_accumulation_node); + SoftmaxOut_accumulation_node->SetGradInMeta(SoftmaxOut, 0); + egr::EagerUtils::CheckAndRetainGrad(SoftmaxOut); + grad_node->SetGradOutMeta(SoftmaxOut, 19); + + auto AttnDropoutOut_accumulation_node = + std::make_shared( + p_autograd_AttnDropoutOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_AttnDropoutOut, 0); + egr::EagerUtils::SetHistory(p_autograd_AttnDropoutOut, + AttnDropoutOut_accumulation_node); + AttnDropoutOut_accumulation_node->SetGradInMeta(AttnDropoutOut, 0); + egr::EagerUtils::CheckAndRetainGrad(AttnDropoutOut); + grad_node->SetGradOutMeta(AttnDropoutOut, 20); + + auto FMHAOut_accumulation_node = + std::make_shared(p_autograd_FMHAOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_FMHAOut, 0); + egr::EagerUtils::SetHistory(p_autograd_FMHAOut, + FMHAOut_accumulation_node); + FMHAOut_accumulation_node->SetGradInMeta(FMHAOut, 0); + egr::EagerUtils::CheckAndRetainGrad(FMHAOut); + grad_node->SetGradOutMeta(FMHAOut, 21); + + auto OutLinearOut_accumulation_node = + std::make_shared(p_autograd_OutLinearOut); + egr::EagerUtils::SetOutRankWithSlot(p_autograd_OutLinearOut, 0); + egr::EagerUtils::SetHistory(p_autograd_OutLinearOut, + OutLinearOut_accumulation_node); + OutLinearOut_accumulation_node->SetGradInMeta(OutLinearOut, 0); + egr::EagerUtils::CheckAndRetainGrad(OutLinearOut); + grad_node->SetGradOutMeta(OutLinearOut, 22); + } + } + + return std::make_tuple(LnMean, + LnVariance, + LnOut, + QKVOut, + QKVBiasOut, + TransposeOut2, + QKOut, + QKTVOut, + SoftmaxOut, + AttnDropoutMaskOut, + AttnDropoutOut, + SrcMaskOut, + FMHAOut, + OutLinearOut, + DropoutMaskOut, + Ln2Mean, + Ln2Variance, + BiasDropoutResidualOut, + CacheKVOut, + Y); +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt index 4eaa43a4b51c6..28c034e8b5ddb 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt @@ -8,6 +8,11 @@ cc_library( SRCS fused_feedforward_node.cc DEPS ${eager_deps} ${fluid_deps}) +cc_library( + fused_attention_node + SRCS fused_attention_node.cc + DEPS ${eager_deps} ${fluid_deps}) + set(fluid_manual_nodes - fused_gate_attention_node fused_feedforward_node + fused_gate_attention_node fused_feedforward_node fused_attention_node PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc new file mode 100644 index 0000000000000..990cfb5226dbb --- /dev/null +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc @@ -0,0 +1,366 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "glog/logging.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" + +paddle::small_vector, + egr::kSlotSmallVectorSize> +fused_attentionGradNodeCompat::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + VLOG(3) << "Running Eager Backward Node: fused_attentionGradNodeCompat"; + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + outputs(23); + paddle::small_vector, + egr::kSlotSmallVectorSize> + hooked_grads0 = fused_attentionGradNodeCompat::ApplyGradientHooks(grads); + + bool pre_layer_norm = false; + if (attr_map_.count("pre_layer_norm")) { + pre_layer_norm = BOOST_GET_CONST(bool, attr_map_.at("pre_layer_norm")); + } + + std::map>> ins0 = + {{"AttnDropoutMaskOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->AttnDropoutMaskOut_))}, + {"AttnDropoutOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->AttnDropoutOut_))}, + {"DropoutMaskOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->DropoutMaskOut_))}, + {"FMHAOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->FMHAOut_))}, + {"OutLinearOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearOut_))}, + {"OutLinearW", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearW_))}, + {"QKOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->QKOut_))}, + {"QKTVOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->QKTVOut_))}, + {"QKVOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->QKVOut_))}, + {"QKVW", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->QKVW_))}, + {"SoftmaxOut", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxOut_))}, + {"TransposeOut2", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->TransposeOut2_))}, + {"X", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->X_))}, + {"Y@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[19])}}; + std::map>> outs0; + + if ((!out_metas[7].empty()) && (!(out_metas[7][0].IsStopGradient()))) { + outs0.insert({"OutLinearW@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[3].empty()) && (!(out_metas[3][0].IsStopGradient()))) { + outs0.insert({"QKVW@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) { + outs0.insert({"X@GRAD", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + + auto QKVOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKVOut_); + if (QKVOut.defined() && (!out_metas[15].empty()) && + (!out_metas[15][0].IsStopGradient())) + outs0["QKVOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto QKTVOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKTVOut_); + if (QKTVOut.defined() && (!out_metas[16].empty()) && + (!out_metas[16][0].IsStopGradient())) + 
outs0["QKTVOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto TransposeOut2 = + egr::EagerUtils::RecoverTensorWrapper(&this->TransposeOut2_); + if (TransposeOut2.defined() && (!out_metas[17].empty()) && + (!out_metas[17][0].IsStopGradient())) + outs0["TransposeOut2@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto QKOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKOut_); + if (QKOut.defined() && (!out_metas[18].empty()) && + (!out_metas[18][0].IsStopGradient())) + outs0["QKOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto SoftmaxOut = egr::EagerUtils::RecoverTensorWrapper(&this->SoftmaxOut_); + if (SoftmaxOut.defined() && (!out_metas[19].empty()) && + (!out_metas[19][0].IsStopGradient())) + outs0["SoftmaxOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto AttnDropoutOut = + egr::EagerUtils::RecoverTensorWrapper(&this->AttnDropoutOut_); + if (AttnDropoutOut.defined() && (!out_metas[20].empty()) && + (!out_metas[20][0].IsStopGradient())) + outs0["AttnDropoutOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto FMHAOut = egr::EagerUtils::RecoverTensorWrapper(&this->FMHAOut_); + if (FMHAOut.defined() && (!out_metas[21].empty()) && + (!out_metas[21][0].IsStopGradient())) + outs0["FMHAOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + auto OutLinearOut = + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearOut_); + if (OutLinearOut.defined() && (!out_metas[22].empty()) && + (!out_metas[22][0].IsStopGradient())) + outs0["OutLinearOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + + auto QKVBias = egr::EagerUtils::RecoverTensorWrapper(&this->QKVBias_); + if (QKVBias.defined()) { + ins0["QKVBias"] = egr::EagerUtils::TrySyncToVars(QKVBias); + auto QKVBiasOut = egr::EagerUtils::RecoverTensorWrapper(&this->QKVBiasOut_); + ins0["QKVBiasOut"] = egr::EagerUtils::TrySyncToVars(QKVBiasOut); + if (QKVBias.defined() && (!out_metas[4].empty()) && + (!out_metas[4][0].IsStopGradient())) + outs0["QKVBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + if (QKVBiasOut.defined() && (!out_metas[11].empty()) && + (!out_metas[11][0].IsStopGradient())) + outs0["QKVBiasOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto SrcMask = egr::EagerUtils::RecoverTensorWrapper(&this->SrcMask_); + if (SrcMask.defined()) { + ins0["SrcMask"] = egr::EagerUtils::TrySyncToVars(SrcMask); + auto SrcMaskOut = egr::EagerUtils::RecoverTensorWrapper(&this->SrcMaskOut_); + ins0["SrcMaskOut"] = egr::EagerUtils::TrySyncToVars(SrcMaskOut); + if (SrcMaskOut.defined() && (!out_metas[12].empty()) && + (!out_metas[12][0].IsStopGradient())) + outs0["SrcMaskOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto OutLinearBias = + egr::EagerUtils::RecoverTensorWrapper(&this->OutLinearBias_); + if (OutLinearBias.defined()) { + ins0["OutLinearBias"] = egr::EagerUtils::TrySyncToVars(OutLinearBias); + if (OutLinearBias.defined() && (!out_metas[8].empty()) && + (!out_metas[8][0].IsStopGradient())) + outs0["OutLinearBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + if (pre_layer_norm) { + auto LnScale = egr::EagerUtils::RecoverTensorWrapper(&this->LnScale_); + if (LnScale.defined()) { + ins0["LnScale"] = 
egr::EagerUtils::TrySyncToVars(LnScale); + if (LnScale.defined() && (!out_metas[1].empty()) && + (!out_metas[1][0].IsStopGradient())) + outs0["LnScale@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto LnBias = egr::EagerUtils::RecoverTensorWrapper(&this->LnBias_); + if (LnBias.defined()) { + ins0["LnBias"] = egr::EagerUtils::TrySyncToVars(LnBias); + if (LnBias.defined() && (!out_metas[2].empty()) && + (!out_metas[2][0].IsStopGradient())) + outs0["LnBias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto LnOut = egr::EagerUtils::RecoverTensorWrapper(&this->LnOut_); + if (LnOut.defined()) { + ins0["LnOut"] = egr::EagerUtils::TrySyncToVars(LnOut); + if (LnOut.defined() && (!out_metas[13].empty()) && + (!out_metas[13][0].IsStopGradient())) + outs0["LnOut@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto LnMean = egr::EagerUtils::RecoverTensorWrapper(&this->LnMean_); + if (LnMean.defined()) { + ins0["LnMean"] = egr::EagerUtils::TrySyncToVars(LnMean); + } + + auto LnVariance = egr::EagerUtils::RecoverTensorWrapper(&this->LnVariance_); + if (LnVariance.defined()) { + ins0["LnVariance"] = egr::EagerUtils::TrySyncToVars(LnVariance); + } + } else { + auto Ln2Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Scale_); + if (Ln2Scale.defined()) { + ins0["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale); + if (Ln2Scale.defined() && (!out_metas[9].empty()) && + (!out_metas[9][0].IsStopGradient())) + outs0["Ln2Scale@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto Ln2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Bias_); + if (Ln2Bias.defined()) { + ins0["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias); + if (Ln2Bias.defined() && (!out_metas[10].empty()) && + (!out_metas[10][0].IsStopGradient())) + outs0["Ln2Bias@GRAD"] = {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + auto BiasDropoutResidualOut = + egr::EagerUtils::RecoverTensorWrapper(&this->BiasDropoutResidualOut_); + auto Ln2Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Mean_); + auto Ln2Variance = + egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Variance_); + ins0["BiasDropoutResidualOut"] = + egr::EagerUtils::TrySyncToVars(BiasDropoutResidualOut); + ins0["Ln2Mean"] = egr::EagerUtils::TrySyncToVars(Ln2Mean); + ins0["Ln2Variance"] = egr::EagerUtils::TrySyncToVars(Ln2Variance); + if (BiasDropoutResidualOut.defined() && (!out_metas[14].empty()) && + (!out_metas[14][0].IsStopGradient())) + outs0["BiasDropoutResidualOut@GRAD"] = { + std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}; + } + + auto& attrs_map0 = this->attr_map_; + // Pass the entire attribute map to TraceOp + // The underlying kernel will pickup whatever attribute they need at runtime + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_attention_grad", + ins0, + outs0, + attrs_map0, + egr::Controller::Instance().GetExpectedPlace(), + &this->default_attr_map_, + false, + {}); + + if (outs0.find("OutLinearW@GRAD") != outs0.end()) { + outputs[7] = egr::EagerUtils::GetOutputs(outs0["OutLinearW@GRAD"]); + } + if (outs0.find("QKVW@GRAD") != outs0.end()) { + outputs[3] = egr::EagerUtils::GetOutputs(outs0["QKVW@GRAD"]); + } + if (outs0.find("X@GRAD") != outs0.end()) { + outputs[0] = egr::EagerUtils::GetOutputs(outs0["X@GRAD"]); + } + + if (outs0.find("QKVOut@GRAD") != outs0.end()) { + outputs[15] = 
egr::EagerUtils::GetOutputs(outs0["QKVOut@GRAD"]); + } + if (outs0.find("QKTVOut@GRAD") != outs0.end()) { + outputs[16] = egr::EagerUtils::GetOutputs(outs0["QKTVOut@GRAD"]); + } + if (outs0.find("TransposeOut2@GRAD") != outs0.end()) { + outputs[17] = egr::EagerUtils::GetOutputs(outs0["TransposeOut2@GRAD"]); + } + if (outs0.find("QKOut@GRAD") != outs0.end()) { + outputs[18] = egr::EagerUtils::GetOutputs(outs0["QKOut@GRAD"]); + } + if (outs0.find("SoftmaxOut@GRAD") != outs0.end()) { + outputs[19] = egr::EagerUtils::GetOutputs(outs0["SoftmaxOut@GRAD"]); + } + if (outs0.find("AttnDropoutOut@GRAD") != outs0.end()) { + outputs[20] = egr::EagerUtils::GetOutputs(outs0["AttnDropoutOut@GRAD"]); + } + if (outs0.find("FMHAOut@GRAD") != outs0.end()) { + outputs[21] = egr::EagerUtils::GetOutputs(outs0["FMHAOut@GRAD"]); + } + if (outs0.find("OutLinearOut@GRAD") != outs0.end()) { + outputs[22] = egr::EagerUtils::GetOutputs(outs0["OutLinearOut@GRAD"]); + } + + if (QKVBias.defined()) { + if (outs0.find("QKVBias@GRAD") != outs0.end()) { + outputs[4] = egr::EagerUtils::GetOutputs(outs0["QKVBias@GRAD"]); + } + if (outs0.find("QKVBiasOut@GRAD") != outs0.end()) { + outputs[11] = egr::EagerUtils::GetOutputs(outs0["QKVBiasOut@GRAD"]); + } + } + + if (SrcMask.defined()) { + if (outs0.find("SrcMaskOut@GRAD") != outs0.end()) { + outputs[12] = egr::EagerUtils::GetOutputs(outs0["SrcMaskOut@GRAD"]); + } + } + + if (OutLinearBias.defined()) { + if (outs0.find("OutLinearBias@GRAD") != outs0.end()) { + outputs[8] = egr::EagerUtils::GetOutputs(outs0["OutLinearBias@GRAD"]); + } + } + + if (pre_layer_norm) { + auto LnScale = egr::EagerUtils::RecoverTensorWrapper(&this->LnScale_); + if (LnScale.defined()) { + if (outs0.find("LnScale@GRAD") != outs0.end()) { + outputs[1] = egr::EagerUtils::GetOutputs(outs0["LnScale@GRAD"]); + } + } + + auto LnBias = egr::EagerUtils::RecoverTensorWrapper(&this->LnBias_); + if (LnBias.defined()) { + if (outs0.find("LnBias@GRAD") != outs0.end()) { + outputs[2] = egr::EagerUtils::GetOutputs(outs0["LnBias@GRAD"]); + } + } + + auto LnOut = egr::EagerUtils::RecoverTensorWrapper(&this->LnOut_); + if (LnOut.defined()) { + if (outs0.find("LnOut@GRAD") != outs0.end()) { + outputs[13] = egr::EagerUtils::GetOutputs(outs0["LnOut@GRAD"]); + } + } + } else { + auto Ln2Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Scale_); + if (Ln2Scale.defined()) { + if (outs0.find("Ln2Scale@GRAD") != outs0.end()) { + outputs[9] = egr::EagerUtils::GetOutputs(outs0["Ln2Scale@GRAD"]); + } + } + + auto Ln2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Bias_); + if (Ln2Bias.defined()) { + if (outs0.find("Ln2Bias@GRAD") != outs0.end()) { + outputs[10] = egr::EagerUtils::GetOutputs(outs0["Ln2Bias@GRAD"]); + } + } + if (outs0.find("BiasDropoutResidualOut@GRAD") != outs0.end()) { + outputs[14] = + egr::EagerUtils::GetOutputs(outs0["BiasDropoutResidualOut@GRAD"]); + } + } + + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs); + return outputs; +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h index 52d3b44d7ba2a..571deb4e9ca74 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h @@ -329,3 +329,205 @@ class fused_feedforwardGradNodeCompat : public egr::GradNodeBase { paddle::framework::AttributeMap attr_map_; paddle::framework::AttributeMap default_attr_map_; }; + +class fused_attentionGradNodeCompat : public egr::GradNodeBase { + 
public: + fused_attentionGradNodeCompat() : egr::GradNodeBase() { + VLOG(7) << " Construct fused_attentionGradNodeCompat "; + } + fused_attentionGradNodeCompat(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { + VLOG(7) << " Construct fused_attentionGradNodeCompat "; + } + ~fused_attentionGradNodeCompat() override { + VLOG(6) << " Destruct fused_attentionGradNodeCompat "; + } + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; + + void ClearTensorWrappers() override { + AttnDropoutMaskOut_.clear(); + AttnDropoutOut_.clear(); + BiasDropoutResidualOut_.clear(); + DropoutMaskOut_.clear(); + FMHAOut_.clear(); + Ln2Bias_.clear(); + Ln2Mean_.clear(); + Ln2Scale_.clear(); + Ln2Variance_.clear(); + OutLinearBias_.clear(); + OutLinearOut_.clear(); + OutLinearW_.clear(); + QKOut_.clear(); + QKTVOut_.clear(); + QKVBias_.clear(); + QKVBiasOut_.clear(); + QKVOut_.clear(); + QKVW_.clear(); + SoftmaxOut_.clear(); + SrcMask_.clear(); + SrcMaskOut_.clear(); + TransposeOut2_.clear(); + X_.clear(); + + SetIsTensorWrappersCleared(true); + } + std::string name() override { return "fused_attentionGradNodeCompat"; } + + std::shared_ptr Copy() const override { + { + auto copied_node = std::shared_ptr( + new fused_attentionGradNodeCompat(*this)); + return copied_node; + } + } + + // SetX, SetY, ... + void SetTensorWrapperAttnDropoutMaskOut( + const paddle::experimental::Tensor& AttnDropoutMaskOut) { + AttnDropoutMaskOut_ = egr::TensorWrapper(AttnDropoutMaskOut, false); + } + void SetTensorWrapperAttnDropoutOut( + const paddle::experimental::Tensor& AttnDropoutOut) { + AttnDropoutOut_ = egr::TensorWrapper(AttnDropoutOut, false); + } + void SetTensorWrapperBiasDropoutResidualOut( + const paddle::experimental::Tensor& BiasDropoutResidualOut) { + BiasDropoutResidualOut_ = egr::TensorWrapper(BiasDropoutResidualOut, false); + } + void SetTensorWrapperDropoutMaskOut( + const paddle::experimental::Tensor& DropoutMaskOut) { + DropoutMaskOut_ = egr::TensorWrapper(DropoutMaskOut, false); + } + void SetTensorWrapperFMHAOut(const paddle::experimental::Tensor& FMHAOut) { + FMHAOut_ = egr::TensorWrapper(FMHAOut, false); + } + void SetTensorWrapperLn2Bias(const paddle::experimental::Tensor& Ln2Bias) { + Ln2Bias_ = egr::TensorWrapper(Ln2Bias, false); + } + void SetTensorWrapperLn2Mean(const paddle::experimental::Tensor& Ln2Mean) { + Ln2Mean_ = egr::TensorWrapper(Ln2Mean, false); + } + void SetTensorWrapperLn2Scale(const paddle::experimental::Tensor& Ln2Scale) { + Ln2Scale_ = egr::TensorWrapper(Ln2Scale, false); + } + void SetTensorWrapperLn2Variance( + const paddle::experimental::Tensor& Ln2Variance) { + Ln2Variance_ = egr::TensorWrapper(Ln2Variance, false); + } + void SetTensorWrapperOutLinearBias( + const paddle::experimental::Tensor& OutLinearBias) { + OutLinearBias_ = egr::TensorWrapper(OutLinearBias, false); + } + void SetTensorWrapperOutLinearOut( + const paddle::experimental::Tensor& OutLinearOut) { + OutLinearOut_ = egr::TensorWrapper(OutLinearOut, false); + } + void SetTensorWrapperOutLinearW( + const paddle::experimental::Tensor& OutLinearW) { + OutLinearW_ = egr::TensorWrapper(OutLinearW, false); + } + void SetTensorWrapperQKOut(const paddle::experimental::Tensor& QKOut) { + QKOut_ = egr::TensorWrapper(QKOut, false); + } + void SetTensorWrapperQKTVOut(const paddle::experimental::Tensor& 
QKTVOut) { + QKTVOut_ = egr::TensorWrapper(QKTVOut, false); + } + void SetTensorWrapperQKVBias(const paddle::experimental::Tensor& QKVBias) { + QKVBias_ = egr::TensorWrapper(QKVBias, false); + } + void SetTensorWrapperQKVBiasOut( + const paddle::experimental::Tensor& QKVBiasOut) { + QKVBiasOut_ = egr::TensorWrapper(QKVBiasOut, false); + } + void SetTensorWrapperQKVOut(const paddle::experimental::Tensor& QKVOut) { + QKVOut_ = egr::TensorWrapper(QKVOut, false); + } + void SetTensorWrapperQKVW(const paddle::experimental::Tensor& QKVW) { + QKVW_ = egr::TensorWrapper(QKVW, false); + } + void SetTensorWrapperSoftmaxOut( + const paddle::experimental::Tensor& SoftmaxOut) { + SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false); + } + void SetTensorWrapperSrcMask(const paddle::experimental::Tensor& SrcMask) { + SrcMask_ = egr::TensorWrapper(SrcMask, false); + } + void SetTensorWrapperSrcMaskOut( + const paddle::experimental::Tensor& SrcMaskOut) { + SrcMaskOut_ = egr::TensorWrapper(SrcMaskOut, false); + } + void SetTensorWrapperTransposeOut2( + const paddle::experimental::Tensor& TransposeOut2) { + TransposeOut2_ = egr::TensorWrapper(TransposeOut2, false); + } + void SetTensorWrapperX(const paddle::experimental::Tensor& X) { + X_ = egr::TensorWrapper(X, false); + } + void SetTensorWrapperLnScale(const paddle::experimental::Tensor& LnScale) { + LnScale_ = egr::TensorWrapper(LnScale, false); + } + void SetTensorWrapperLnBias(const paddle::experimental::Tensor& LnBias) { + LnBias_ = egr::TensorWrapper(LnBias, false); + } + void SetTensorWrapperLnOut(const paddle::experimental::Tensor& LnOut) { + LnOut_ = egr::TensorWrapper(LnOut, false); + } + void SetTensorWrapperLnMean(const paddle::experimental::Tensor& LnMean) { + LnMean_ = egr::TensorWrapper(LnMean, false); + } + void SetTensorWrapperLnVariance( + const paddle::experimental::Tensor& LnVariance) { + LnVariance_ = egr::TensorWrapper(LnVariance, false); + } + + // SetAttrMap + void SetAttrMap(paddle::framework::AttributeMap&& attr_map) { + attr_map_ = std::move(attr_map); + } + void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) { + default_attr_map_ = std::move(default_attr_map); + } + + private: + // TensorWrappers + egr::TensorWrapper AttnDropoutMaskOut_; + egr::TensorWrapper AttnDropoutOut_; + egr::TensorWrapper BiasDropoutResidualOut_; + egr::TensorWrapper DropoutMaskOut_; + egr::TensorWrapper FMHAOut_; + egr::TensorWrapper Ln2Bias_; + egr::TensorWrapper Ln2Mean_; + egr::TensorWrapper Ln2Scale_; + egr::TensorWrapper Ln2Variance_; + egr::TensorWrapper OutLinearBias_; + egr::TensorWrapper OutLinearOut_; + egr::TensorWrapper OutLinearW_; + egr::TensorWrapper QKOut_; + egr::TensorWrapper QKTVOut_; + egr::TensorWrapper QKVBias_; + egr::TensorWrapper QKVBiasOut_; + egr::TensorWrapper QKVOut_; + egr::TensorWrapper QKVW_; + egr::TensorWrapper SoftmaxOut_; + egr::TensorWrapper SrcMask_; + egr::TensorWrapper SrcMaskOut_; + egr::TensorWrapper TransposeOut2_; + egr::TensorWrapper X_; + + egr::TensorWrapper LnScale_; + egr::TensorWrapper LnBias_; + egr::TensorWrapper LnOut_; + egr::TensorWrapper LnMean_; + egr::TensorWrapper LnVariance_; + + // Attribute Map + paddle::framework::AttributeMap attr_map_; + paddle::framework::AttributeMap default_attr_map_; +}; diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 6eb35eb13f3f7..1b3c7fd8e4649 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ 
b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -51,8 +51,10 @@ static std::unordered_set ops_to_fill_zero_for_empty_grads = { "split", "rnn"}; /* --- Black Ops list that's NO NEED to apply code generation --- */ -static std::unordered_set black_ops_list = { - "run_program", "fused_gate_attention", "fused_feedforward"}; +static std::unordered_set black_ops_list = {"run_program", + "fused_gate_attention", + "fused_feedforward", + "fused_attention"}; static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index 6507cc1ee3258..1ad29ecadd7bf 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -26,9 +26,7 @@ from paddle.fluid import layers import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph - -_enable_legacy_dygraph() +from paddle.fluid.framework import default_main_program default_main_program().random_seed = 42 diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py index 0aad7ec7581e9..8b8d378e5c8fa 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -26,11 +26,9 @@ from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float from test_sparse_attention_op import get_cuda_version from paddle import _C_ops -from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph +from paddle.fluid.framework import default_main_program from paddle.fluid import core -_enable_legacy_dygraph() - @unittest.skipIf(not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA") From 6984fbca87ffab9659bc411bac756c2c83f369f9 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Thu, 7 Jul 2022 13:09:25 +0800 Subject: [PATCH 091/250] [IPU] support dy2static for IPU merge code (#43770) * feat(): dynamic_to_static support for ipu. * fix(): format fix. * fix format * fix cpplint error * use phi::errors * fix format * fix format * fix(): add api to restore patched function. * fix(): identity_loss uses cpu place as expected kernel type. * doc(): add IPU dy2static related docs. * fix(): combine test cases. * fix format * fix comment * fix format * apply comment * fix compiling * fix(): align docs. * fix(): fix identity_loss function docs. * fix(): adjust mean and sum in identity_loss. * fix(): minor docs. 
* move API to paddle.incubate.identity_loss * fix UT Co-authored-by: zhaorui chen --- paddle/fluid/framework/executor_cache.cc | 4 + .../ir/ipu/ipu_runtime_replacer_pass.cc | 3 + .../ir/ipu/optimizer_extract_pass.cc | 13 + paddle/fluid/framework/parallel_executor.cc | 9 + paddle/fluid/imperative/prepared_operator.cc | 10 + paddle/fluid/imperative/tracer.cc | 9 + paddle/fluid/operators/identity_loss_op.cc | 108 +++++++ .../fluid/platform/device/ipu/ipu_compiler.cc | 4 +- .../fluid/platform/device/ipu/ipu_executor.cc | 9 +- .../fluid/platform/device/ipu/ipu_strategy.cc | 4 + .../fluid/platform/device/ipu/ipu_strategy.h | 6 + .../ipu/popart_canonicalization/other_ops.cc | 12 + .../device/ipu/supported_ops_custom.h | 1 + paddle/fluid/platform/device_context.cc | 2 + paddle/fluid/pybind/imperative.cc | 52 ++- paddle/phi/infermeta/unary.cc | 12 + paddle/phi/infermeta/unary.h | 2 + .../kernels/cpu/identity_loss_grad_kernel.cc | 59 ++++ .../phi/kernels/cpu/identity_loss_kernel.cc | 54 ++++ .../phi/kernels/identity_loss_grad_kernel.h | 30 ++ paddle/phi/kernels/identity_loss_kernel.h | 29 ++ paddle/phi/ops/compat/identity_loss_sig.cc | 34 ++ python/paddle/fluid/compiler.py | 303 ++++++++++++++++++ python/paddle/fluid/layers/loss.py | 57 ++++ .../unittests/ipu/test_dy2static_fp16_ipu.py | 137 ++++++++ .../tests/unittests/ipu/test_dy2static_ipu.py | 193 +++++++++++ .../unittests/ipu/test_identity_loss_ipu.py | 105 ++++++ .../tests/unittests/test_identity_loss_op.py | 188 +++++++++++ python/paddle/incubate/__init__.py | 3 + 29 files changed, 1445 insertions(+), 7 deletions(-) create mode 100644 paddle/fluid/operators/identity_loss_op.cc create mode 100644 paddle/phi/kernels/cpu/identity_loss_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/identity_loss_kernel.cc create mode 100644 paddle/phi/kernels/identity_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/identity_loss_kernel.h create mode 100644 paddle/phi/ops/compat/identity_loss_sig.cc create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_identity_loss_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/test_identity_loss_op.py diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index bac3ce5c4f88e..2df86e86a75e0 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -46,6 +46,10 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { execution_strategy.num_threads_ = 1; break; } + case platform::DeviceType::IPU: { + execution_strategy.num_threads_ = 1; + break; + } default: PADDLE_THROW(platform::errors::Unavailable("Unsupported Device type %d.", device_type)); diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc index 65ebd3ec8080d..3dc9f3d10d920 100644 --- a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc @@ -37,6 +37,9 @@ void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const { ipu_rt_op_desc.SetInput("FeedList", feed_list); ipu_rt_op_desc.SetOutput("FetchList", fetch_list); ipu_rt_op_desc.Flush(); + // set op_role to avoid program.clone failure + ipu_rt_op_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + {static_cast(framework::OpRole::kForward)}); // Create 
a new node for the ipu_runtime_op. auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index b45a39aaa8680..f28696194e5f6 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -287,6 +287,19 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { } else if (op_role == OpRole::kLRSched) { // op_role == OpRole::kLRSched | OpRole::kOptimize new_op.SetAttr("with_lr_sched", true); + } else if (op_type == "identity_loss") { + auto outputs = op->Outputs(); + PADDLE_ENFORCE_EQ( + outputs.size(), + 1, + platform::errors::InvalidArgument("Can only support one loss key")); + auto losses = outputs.begin()->second; + PADDLE_ENFORCE_EQ( + losses.size(), + 1, + platform::errors::InvalidArgument("Can only support one loss name")); + auto loss_var = losses.front(); + new_op.SetAttr("loss_var", loss_var); } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 697cb8cdcf6e8..4e6aeaeb7ac6a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -548,6 +548,15 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use XPU device since it's not compiled with XPU," "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (platform::is_ipu_place(place)) { +#if defined(PADDLE_WITH_IPU) + gc.reset(new IPUGarbageCollector(place, max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use IPU device since it's not compiled with IPU," + "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (platform::is_custom_place(place)) { #if defined(PADDLE_WITH_CUSTOM_DEVICE) diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 950a66d5e6d68..029c01a245b1e 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -394,6 +394,16 @@ PreparedOp PrepareImpl( kernel_iter = kernels.find(expected_kernel_key); } #endif +#ifdef PADDLE_WITH_IPU + if (kernel_iter == kernels.end() && + paddle::platform::is_ipu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing IPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } +#endif #ifdef PADDLE_WITH_MLU if (kernel_iter == kernels.end() && paddle::platform::is_mlu_place(expected_kernel_key.place_)) { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 26a5c5adfd666..4c99bfc248e88 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -140,6 +140,15 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use NPU device since it's not compiled with NPU," "Please recompile or reinstall Paddle with NPU support.")); +#endif + } else if (platform::is_ipu_place(place)) { +#if defined(PADDLE_WITH_IPU) + gc.reset(new framework::IPUGarbageCollector(place, 0)); + VLOG(10) << "Created GarbageCollector 
at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use IPU device since it's not compiled with IPU," + "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (platform::is_mlu_place(place)) { #if defined(PADDLE_WITH_MLU) diff --git a/paddle/fluid/operators/identity_loss_op.cc b/paddle/fluid/operators/identity_loss_op.cc new file mode 100644 index 0000000000000..bc9986c7ffea1 --- /dev/null +++ b/paddle/fluid/operators/identity_loss_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +class IdentityLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + platform::CPUPlace()); + } +}; + +class IdentityLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of identity_loss op"); + AddOutput("Out", "(Tensor) The output of identity_loss op"); + AddAttr("reduction", "(int, default 1). The reduction.") + .SetDefault(1) + .InEnum({0, 1, 2}); + AddComment(R"DOC( +IdentityLoss Operator mark the Loss var. 
+ +)DOC"); + } +}; + +class IdentityLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(input_data_type, platform::CPUPlace()); + } +}; + +template +class IdentityLossGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("identity_loss_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(IdentityLossInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(IdentityLossGradInplaceInferer, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(identity_loss, + IdentityLossInferShapeFunctor, + PD_INFER_META(phi::IdentityLossInferMeta)); + +REGISTER_OPERATOR(identity_loss, + ops::IdentityLossOp, + ops::IdentityLossOpMaker, + ops::IdentityLossGradMaker, + ops::IdentityLossGradMaker, + ops::IdentityLossInplaceInferer, + IdentityLossInferShapeFunctor); + +REGISTER_OPERATOR(identity_loss_grad, + ops::IdentityLossGradOp, + ops::IdentityLossGradInplaceInferer); diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 74b216f4e0f58..930af7e1470fc 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -535,7 +535,9 @@ void Compiler::LowerOptimizer(const Scope* scope) { resources_->loss_var = resources_->tensors[loss_var]; resources_->with_lr_sched = BOOST_GET_CONST(bool, op_desc->GetAttr("with_lr_sched")); - if (op_desc->HasAttr("lr_var")) { + if (ipu_strategy_->is_dynamic) { + resources_->lr = ipu_strategy_->lr; + } else if (op_desc->HasAttr("lr_var")) { auto lr_var = BOOST_GET_CONST(std::string, op_desc->GetAttr("lr_var")); resources_->lr_var = lr_var; resources_->lr = GetSingleVarFromScope(scope, lr_var); diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 3cd4a12b378a3..cf051f978208d 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -213,8 +213,13 @@ void Executor::Run(const std::vector &inputs, optimizer = compiler_resources_->eval_optimizer.get(); } else { VLOG(10) << "Update learning_rate"; - auto new_lr = - GetSingleVarFromScope(scope_, compiler_resources_->lr_var); + float new_lr; + if (ipu_strategy_->is_dynamic) { + new_lr = ipu_strategy_->lr; + } else { + new_lr = + GetSingleVarFromScope(scope_, compiler_resources_->lr_var); + } VLOG(10) << "New Lr: " << new_lr; optimizer = compiler_resources_->UpdateOptimizer(new_lr); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc 
index e7d53c751f2b9..d796501069651 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -101,6 +101,10 @@ IpuStrategy::IpuStrategy() { ADD_STRING_OPTION(onnx_dump_path); ADD_STRING_OPTION(weight_decay_mode); + // dy2static support + ADD_DOUBLE_OPTION(lr); + ADD_BOOL_OPTION(is_dynamic); + #undef ADD_STRING_OPTION #undef ADD_DOUBLE_OPTION #undef ADD_UINT64_OPTION diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 9ae54108ac528..997bc310df308 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -112,6 +112,12 @@ class IpuStrategy { // Custom ops std::vector custom_ops; + // lr for dynamic2static + float lr = 0.0; + + // whether in dynamic mode + bool is_dynamic = false; + public: void AddBoolOption(const std::string &option, bool value); void AddUint64Option(const std::string &option, std::uint64_t value); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc index 0b95f641695c1..1e9291cf57256 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc @@ -85,6 +85,17 @@ Node *identity_handler(Graph *graph, Node *node) { graph, node, "popart_identity", node->inputs, node->outputs); } +Node *identity_loss_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto reduction = BOOST_GET_CONST(int, op->GetAttr("reduction")); + return CreateBaseOp(graph, + node, + "popart_identity_loss", + node->inputs, + node->outputs, + {{"reduction", reduction}}); +} + Node *detach_handler(Graph *graph, Node *node) { return CreateBaseOp( graph, node, "popart_detach_v2", node->inputs, node->outputs); @@ -101,4 +112,5 @@ REGISTER_HANDLER(popart_optimizer, popart_optimizer_handler); REGISTER_HANDLER(checkpointoutput, checkpointoutput_handler); REGISTER_HANDLER(custom_nll_loss, custom_nll_loss_handler); REGISTER_HANDLER(identity, identity_handler); +REGISTER_HANDLER(identity_loss, identity_loss_handler); REGISTER_HANDLER(detach, detach_handler); diff --git a/paddle/fluid/platform/device/ipu/supported_ops_custom.h b/paddle/fluid/platform/device/ipu/supported_ops_custom.h index 02d215433c5ee..04c57cc0104de 100644 --- a/paddle/fluid/platform/device/ipu/supported_ops_custom.h +++ b/paddle/fluid/platform/device/ipu/supported_ops_custom.h @@ -17,5 +17,6 @@ #pragma once OP_DECL(popart_nllloss_v2, aiGraphcoreOpset.nllloss, SIG_ARG(INT32,popart::ReductionType,reduction) OPT_ARG(INT32,ignoreIndex) ARG(BOOL,inputIsLogProbability) ) // NOLINT +OP_DECL(popart_identity_loss, aiGraphcoreOpset.identityloss, SIG_ARG(INT32,popart::ReductionType,reduction) ) // NOLINT // clang-format on diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 4dfeca3bd1325..a668d7f4b8366 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -123,6 +123,8 @@ DeviceType Place2DeviceType(const platform::Place& place) { return platform::DeviceType::CUDA; } else if (platform::is_xpu_place(place)) { return platform::DeviceType::XPU; + } else if (platform::is_ipu_place(place)) { + return platform::DeviceType::IPU; } else if (platform::is_mlu_place(place)) { return platform::DeviceType::MLU; } else { diff --git a/paddle/fluid/pybind/imperative.cc 
b/paddle/fluid/pybind/imperative.cc index ab9fb236dbbcc..569890fa25cd6 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -142,6 +142,8 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); + } else if (py::isinstance(place_obj)) { + return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { @@ -151,8 +153,8 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace/" - "CustomPlace")); + "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/IPUPlace/" + "MLUPlace/CustomPlace")); } } @@ -198,6 +200,8 @@ static void InitVarBaseAndTensor(imperative::VarBase *self, tensor, array, place, zero_copy); } else if (platform::is_npu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); + } else if (platform::is_ipu_place(place)) { + SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_mlu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_custom_place(place)) { @@ -206,7 +210,8 @@ static void InitVarBaseAndTensor(imperative::VarBase *self, } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/MLUPlace")); + "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace/IPUPlace/" + "MLUPlace")); } self->SetDataType(framework::TransToProtoVarType(tensor->dtype())); } @@ -1856,6 +1861,18 @@ void BindImperative(py::module *m_ptr) { return new_var; }, py::return_value_policy::copy) + .def( + "_copy_to", + [](const std::shared_ptr &self, + const platform::IPUPlace &place, + bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) .def( "_copy_to", [](const std::shared_ptr &self, @@ -2140,6 +2157,11 @@ void BindImperative(py::module *m_ptr) { self.SetExpectedPlace(*p); VLOG(4) << "Tracer(" << &self << ")" << " set expected place " << *p; + } else if (py::isinstance(obj)) { + auto p = obj.cast(); + self.SetExpectedPlace(*p); + VLOG(4) << "Tracer(" << &self << ")" + << " set expected place " << *p; } else if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); @@ -2158,7 +2180,7 @@ void BindImperative(py::module *m_ptr) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "Incompatible Place Type: supports XPUPlace, CUDAPlace, " - "CPUPlace, NPUPlace, MLUPlace" + "CPUPlace, NPUPlace, IPUPlace, MLUPlace" "and CUDAPinnedPlace, " "but got Unknown Type!")); } @@ -2313,6 +2335,28 @@ void BindImperative(py::module *m_ptr) { inplace_map); } }) + .def("trace", + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::IPUPlace &place, + bool trace_backward, + const std::map &inplace_map = {}) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); + } + }) 
.def("trace", [](imperative::Tracer &self, const std::string &type, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 072ab6fd68a1a..9c5286c066a2b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3262,6 +3262,18 @@ void ChannelShuffleInferMeta(const MetaTensor& x, out->set_dims(output_dims); } +void IdentityLossInferMeta(const MetaTensor& x, + int reduction, + MetaTensor* out) { + if (reduction == 2) { + out->set_dtype(x.dtype()); + out->set_dims(x.dims()); + } else { + out->set_dims(phi::make_ddim({1})); + out->set_dtype(x.dtype()); + } +} + } // namespace phi PD_REGISTER_INFER_META_FN(flatten, phi::FlattenInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index f64d406e019ce..591fb9553a1eb 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -469,4 +469,6 @@ void ChannelShuffleInferMeta(const MetaTensor& x, const std::string& data_format, MetaTensor* out); +void IdentityLossInferMeta(const MetaTensor& x, int reduction, MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/identity_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/identity_loss_grad_kernel.cc new file mode 100644 index 0000000000000..f26195b5069b6 --- /dev/null +++ b/paddle/phi/kernels/cpu/identity_loss_grad_kernel.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/identity_loss_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/mean_all_grad_kernel.h" +#include "paddle/phi/kernels/reduce_sum_grad_kernel.h" + +namespace phi { + +template +void IdentityLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const int reduction, + DenseTensor* x_grad) { + switch (reduction) { + case 0: + // sum + phi::ReduceSumGradKernel( + dev_ctx, x, out_grad, std::vector{0}, false, true, x_grad); + break; + case 1: + // mean + phi::MeanAllGradKernel(dev_ctx, x, out_grad, x_grad); + break; + case 2: + // none + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + break; + default: + // error + PADDLE_THROW(phi::errors::InvalidArgument( + "reduction should be 0, 1 and 2. But get %d", reduction)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(identity_loss_grad, + CPU, + ALL_LAYOUT, + phi::IdentityLossGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/identity_loss_kernel.cc b/paddle/phi/kernels/cpu/identity_loss_kernel.cc new file mode 100644 index 0000000000000..941174eb5b0bd --- /dev/null +++ b/paddle/phi/kernels/cpu/identity_loss_kernel.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/identity_loss_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/mean_all_kernel.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" + +namespace phi { + +template +void IdentityLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const int reduction, + DenseTensor* out) { + switch (reduction) { + case 0: + // sum + phi::SumRawKernel( + dev_ctx, x, std::vector{0}, false, true, out->dtype(), out); + break; + case 1: + // mean + phi::MeanAllKernel(dev_ctx, x, out); + break; + case 2: + // none + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + break; + default: + // error + PADDLE_THROW(phi::errors::InvalidArgument( + "reduction should be 0, 1 and 2. But get %d", reduction)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + identity_loss, CPU, ALL_LAYOUT, phi::IdentityLossKernel, float, double) {} diff --git a/paddle/phi/kernels/identity_loss_grad_kernel.h b/paddle/phi/kernels/identity_loss_grad_kernel.h new file mode 100644 index 0000000000000..02422fd936bda --- /dev/null +++ b/paddle/phi/kernels/identity_loss_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void IdentityLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const int reduction, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/identity_loss_kernel.h b/paddle/phi/kernels/identity_loss_kernel.h new file mode 100644 index 0000000000000..895b565894b22 --- /dev/null +++ b/paddle/phi/kernels/identity_loss_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void IdentityLossKernel(const Context& dev_ctx, + const DenseTensor& x, + const int reduction, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/identity_loss_sig.cc b/paddle/phi/ops/compat/identity_loss_sig.cc new file mode 100644 index 0000000000000..aa9516bd1ec4f --- /dev/null +++ b/paddle/phi/ops/compat/identity_loss_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IdentityLossOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("identity_loss", {"X"}, {"reduction"}, {"Out"}); +} + +KernelSignature IdentityLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "identity_loss_grad", {"X", "Out@GRAD"}, {"reduction"}, {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(identity_loss, phi::IdentityLossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(identity_loss_grad, + phi::IdentityLossGradOpArgumentMapping); diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 38393311de5f8..11ccd476b1b59 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -506,6 +506,192 @@ def _get_places(self, place, place_list): return place_list +class IpuDynamicPatcher(object): + """ + Patcher for IPU dynamic2static support. + """ + + patcher_cache = [] + + def __init__(self): + pass + + @staticmethod + def convert_concrete_program(ipu_strategy, + concrete_program, + class_instance=None): + """ + Convert the ConcreteProgram to IPUConcreteProgram. 
+ """ + from ..fluid.dygraph.base import switch_to_static_graph + from ..fluid import backward + from ..fluid.initializer import Constant + from ..fluid.framework import device_guard + import paddle + + inputs = concrete_program.inputs + outputs = concrete_program.outputs + startup_program = concrete_program.startup_program + + scope = paddle.static.global_scope() + + @switch_to_static_graph + def append_backward_desc(): + program = concrete_program.main_program + + # backward with optimizer to add backward graph to program + backward.gradients_with_optimizer(program, ipu_strategy._optimizer) + + # initialize backward parameters + exe = paddle.static.Executor(paddle.CPUPlace()) + startup_program = paddle.static.default_startup_program() + exe.run(startup_program) + + return program + + if ipu_strategy.enable_fp16: + class_instance.to(dtype="float16") + + # copy the bias and filters + for param_or_buffer in concrete_program.parameters: + param_or_buffer_tensor = scope.var( + param_or_buffer.name).get_tensor() + src_tensor = param_or_buffer.value().get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + + # TODO(czr): feed and fetch list needs to consider more type + if class_instance: + feed_list = [elem.name for elem in inputs[1:] if elem is not None] + else: + feed_list = [elem.name for elem in inputs if elem is not None] + fetch_list = [elem.name for elem in outputs] + + if ipu_strategy.is_training: + concrete_program.main_program = append_backward_desc() + # copy optimizer parameters + optimizer = ipu_strategy._optimizer + for k, v in optimizer._accumulators.items(): + for param_name, var_tmp in v.items(): + var = optimizer.helper.create_global_variable( + name=var_tmp.name, + persistable=True, + dtype=var_tmp.dtype, + type=var_tmp.type, + shape=var_tmp.shape, + belong_to_optimizer=True) + device = optimizer._get_device_for_param(param_name) + with device_guard(device): + optimizer.helper.set_variable_initializer( + var, initializer=Constant(value=0.0)) + param_or_lr_tensor = scope.find_var( + var_tmp.name).get_tensor() + optim_tensor = var.value().get_tensor() + param_or_lr_tensor._share_data_with(optim_tensor) + optimizer._accumulators[k][param_name] = var + + @switch_to_static_graph + def func_compile(): + if ipu_strategy.enable_fp16: + amp_list = paddle.static.amp.CustomOpLists() + amp_list.unsupported_list = {"cumsum"} + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + concrete_program.main_program, + amp_list, + use_fp16_guard=False) + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + concrete_program.main_program, + to_fp16_var_names=to_fp16_var_names) + + program = IpuCompiledProgram(concrete_program.main_program, + ipu_strategy=ipu_strategy, + scope=scope).compile( + feed_list, fetch_list) + return program + + main_program = func_compile() + concrete_program.main_program = main_program + return concrete_program + + @staticmethod + def patch_program_cache(ipu_strategy): + """ Monkey patch ProgramCache discriptor to support dynamic2static in IPU. + + Args: + ipu_strategy: The ipu_strategy used in dynamic graph. 
+ + Returns: + None + """ + from ..fluid.dygraph.dygraph_to_static.program_translator import ProgramCache + from ..fluid.dygraph.dygraph_to_static.program_translator import CacheKey + from ..fluid.dygraph.dygraph_to_static import logging_utils + from ..fluid.dygraph.dygraph_to_static.program_translator import MAX_TRACED_PROGRAM_COUNT + from ..fluid.dygraph.dygraph_to_static.partial_program import partial_program_from + + old_getter = ProgramCache.__getitem__ + + def patch_getter(self, item): + if not isinstance(item, CacheKey): + raise ValueError( + 'type(item) should be CacheKey, but received %s' % + type_name(item)) + item_id = hash(item) + self._recent_key = item_id + if item_id not in self._caches or ipu_strategy.need_compile: + if item_id in self._caches: + logging_utils.warn( + "ipu_strategy chances detected. Please sync weights.") + if self._caches and not ipu_strategy.need_compile: + logging_utils.warn( + "dynamic2static on IPU doesn't support mutiple caches. Please make sure" + "dynamic inputs is not used.") + concrete_program, _ = self._build_once(item) + concrete_program = IpuDynamicPatcher.convert_concrete_program( + ipu_strategy, concrete_program, item.class_instance) + + self._caches[item_id] = (concrete_program, + partial_program_from(concrete_program)) + # Note: raise warnings if number of traced program is more than `max_tracing_count` + current_tracing_count = len(self._caches) + if current_tracing_count > MAX_TRACED_PROGRAM_COUNT: + logging_utils.warn( + "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. " + "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors." + .format(current_tracing_count, + MAX_TRACED_PROGRAM_COUNT)) + + return self._caches[item_id] + + setattr(ProgramCache, '__getitem__', patch_getter) + IpuDynamicPatcher.patcher_cache.append( + [ProgramCache, '__getitem__', old_getter]) + + @staticmethod + def patch_lr_scheduler(ipu_strategy): + from paddle.optimizer.lr import LRScheduler + # For IPU dynamic graph usage, lr_var is not synced in executor as static mode do. + # Manually set lr to ipu_strategy to update the lr. + old_step = LRScheduler.step + + def patch_step(self, epoch=None): + old_step(self, epoch) + ipu_strategy.set_options({"lr": self.last_lr}) + + setattr(LRScheduler, 'step', patch_step) + IpuDynamicPatcher.patcher_cache.append([LRScheduler, 'step', old_step]) + + @staticmethod + def register_patch(ipu_strategy): + IpuDynamicPatcher.patch_program_cache(ipu_strategy) + IpuDynamicPatcher.patch_lr_scheduler(ipu_strategy) + + @staticmethod + def release_patch(): + for module, key, attr in IpuDynamicPatcher.patcher_cache: + setattr(module, key, attr) + + class IpuStrategy(object): """ Help users precisely control the graph building in :code:`paddle.static.IpuCompiledProgram` . @@ -542,10 +728,121 @@ def __init__(self): self._ipu_strategy.set_options(default_options) self.has_custom_ops = False self.custom_op_names = [] + self.need_compile = True else: raise RuntimeError( "Can not use IpuStrategy in non IPU compiled environment, please re-compile with WITH_IPU=ON." ) + from paddle import in_dynamic_mode + if in_dynamic_mode(): + self.register_patch() + + def register_patch(self): + """ + Register patchs function to support dynamic to static on IPU. This operation would break the dy2static functionality on CPU. + Use `release_patch` to release the patch. + + Examples: + .. 
code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + ipu_strategy = static.IpuStrategy() + + ipu_strategy.register_patch() + """ + IpuDynamicPatcher.register_patch(self) + + def release_patch(self): + """ + Release the registered IPU functions. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + ipu_strategy = static.IpuStrategy() + + ipu_strategy.release_patch() + """ + IpuDynamicPatcher.release_patch() + + def set_optimizer(self, optimizer): + """ + Set optimizer to ipu_strategy in dynamic mode. + + Args: + optimizer (Optimizer): Optimizer to be used in training. + + Returns: + None. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + linear = paddle.nn.Linear(10, 10) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=linear.parameters()) + ipu_strategy = static.IpuStrategy() + ipu_strategy.set_optimizer(optimizer) + """ + from paddle import in_dynamic_mode + if in_dynamic_mode(): + self._optimizer = optimizer + optimizer_attrs = self.parse_optimizer(optimizer) + self._ipu_strategy.set_options(optimizer_attrs) + else: + raise RuntimeError("Only needs to set optimizer in dynamic mode.") + + def parse_optimizer(self, optimizer): + """ + Parse optimizer attributes for IPU dynamic to static support. Currently only support parse lr. + + Args: + optimizer (Optimizer): Optimizer to be parsed. + + Returns: + Dict. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + linear = paddle.nn.Linear(10, 10) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=linear.parameters()) + ipu_strategy = static.IpuStrategy() + attrs = ipu_strategy.parse_optimizer(optimizer) + """ + + def get_lr(): + from paddle.optimizer.lr import LRScheduler + if isinstance(optimizer._learning_rate, float): + return {"lr": optimizer._learning_rate} + elif isinstance(optimizer._learning_rate, LRScheduler): + return {"lr": optimizer._learning_rate()} + + attr_fn = [get_lr] + optimizer_attrs = {"is_dynamic": True} + for fn in attr_fn: + optimizer_attrs.update(fn()) + return optimizer_attrs def set_graph_config(self, num_ipus=1, @@ -743,6 +1040,10 @@ def set_options(self, options): ipu_strategy.set_options(options) """ self._ipu_strategy.set_options(options) + # check whether to recompile program with updated ipu options. + recompile_white_list = {'lr'} + if options.keys() - recompile_white_list: + self.need_compile = True def get_option(self, option): """ @@ -1050,4 +1351,6 @@ def compile(self, feed_list, fetch_list): if not hasattr(program, 'org_program'): program.org_program = self._program + self._ipu_strategy.need_compile = False + return program diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index 1ad4e3c4298c2..00c2aa56fa3e0 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -1230,6 +1230,63 @@ def softmax_with_cross_entropy(logits, return_softmax, axis) +def identity_loss(x, reduction="none"): + r"""Marks a tensor as being part of the loss calculation for IPU. + + This operator is used to handle on the (final) loss of a model so that + it is used as the start of backpropagation. + + When `reduction` is `none`, return raw `Out`. + + When `reduction` is `mean`, return + + .. math:: + Out = MEAN(Out) + + When `reduction` is `sum`, return + + .. 
math:: + Out = SUM(Out) + + Parameters: + x (Variable): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of + additional dimensions. It's data type should be float32, float64 on CPU and float16, float32 on IPU. + reduction(str|int, optional): Reduce the loss output. Supported string values are: 'sum', 'mean', 'none' + the corresponding int values are 0, 1, 2 respectively. The default value is "none". + + Returns: + Variable: The loss ``Tensor`` with the specified reduction applied. + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + import paddle + paddle.enable_static() + loss = fluid.data(name="loss", shape=[-1, 1], dtype="float32") + out = paddle.incubate.identity_loss(loss, reduction=1) + """ + if isinstance(reduction, str): + reduction = {"sum": 0, "mean": 1, "none": 2}.get(reduction.lower()) + if reduction is None: + raise Exception("Unsupported reduction type.") + + if _non_static_mode(): + return _C_ops.identity_loss(x, "reduction", reduction) + + check_variable_and_dtype(x, 'x', ['float32', 'float64'], "identity_loss") + attrs = {'reduction': reduction} + helper = LayerHelper('identity_loss', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type="identity_loss", + inputs={"X": x}, + outputs={"Out": out}, + attrs=attrs) + return out + + def rank_loss(label, left, right, name=None): r""" diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py new file mode 100644 index 0000000000000..1484c9fdcb53e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py @@ -0,0 +1,137 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +import os +import paddle +import paddle.fluid as fluid +from paddle.jit import to_static +from paddle.utils.cpp_extension import load +from paddle.optimizer.lr import LRScheduler +import tempfile + +SEED = 2022 + + +class SimpleLayer(paddle.nn.Layer): + + def __init__(self, use_ipu=False): + super(SimpleLayer, self).__init__() + self.use_ipu = use_ipu + self.conv = paddle.nn.Conv2D(in_channels=3, + out_channels=1, + kernel_size=2, + stride=1) + + def forward(self, x, target=None): + x = self.conv(x) + x = paddle.fluid.layers.flatten(x, axis=1) + if target is not None: + x = paddle.fluid.layers.softmax(x) + loss = paddle.fluid.layers.cross_entropy(x, target) + if self.use_ipu: + loss = paddle.incubate.identity_loss(loss, 1) + else: + loss = paddle.mean(loss) + return x, loss + return x + + +class TestBase(unittest.TestCase): + + @classmethod + def setUpClass(cls): + paddle.disable_static() + cls.save_path = tempfile.TemporaryDirectory() + + @classmethod + def tearDownClass(cls): + cls.save_path.cleanup() + + def _test(self, use_ipu=False): + paddle.seed(SEED) + np.random.seed(SEED) + model = SimpleLayer(use_ipu) + specs = [ + paddle.static.InputSpec(name="x", + shape=[32, 3, 10, 10], + dtype="float32"), + paddle.static.InputSpec(name="target", shape=[32], dtype="int64"), + ] + model = paddle.jit.to_static(model, input_spec=specs) + optim = paddle.optimizer.Adam(learning_rate=0.01, + parameters=model.parameters()) + data = paddle.uniform((32, 3, 10, 10), dtype='float32') + label = paddle.randint(0, 10, shape=[32], dtype='int64') + model_path = '{}/model_state_dict_{}.pdparams'.format( + self.save_path, 'ipu' if use_ipu else 'cpu') + optim_path = '{}/optim_state_dict_{}.pdopt'.format( + self.save_path, 'ipu' if use_ipu else 'cpu') + + if use_ipu: + device = paddle.set_device('ipu') + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=1, + is_training=True, + micro_batch_size=1, + enable_manual_shard=False) + ipu_strategy.set_precision_config(enable_fp16=True) + ipu_strategy.set_optimizer(optim) + data = data.astype(np.float16) + + result = [] + for epoch in range(100): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(data, label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + + result.append(loss) + + if use_ipu: + paddle.fluid.core.IpuBackend.get_instance().weights_to_host() + + paddle.save(model.state_dict(), model_path) + paddle.save(optim.state_dict(), optim_path) + + model.set_state_dict(paddle.load(model_path)) + optim.set_state_dict(paddle.load(optim_path)) + + for epoch in range(100): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(data, label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + + result.append(loss) + + return np.array(result) + + def test_training(self): + cpu_loss = self._test(False).flatten() + ipu_loss = self._test(True).flatten() + + self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-2)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py new file mode 100644 index 0000000000000..28decc76a421c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py @@ -0,0 +1,193 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +import paddle +import paddle.fluid as fluid +from paddle.jit import to_static +from paddle.utils.cpp_extension import load +from paddle.optimizer.lr import LRScheduler +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramCache +import tempfile + +SEED = 2022 + + +class SimpleLayer(paddle.nn.Layer): + + def __init__(self, use_ipu=False): + super(SimpleLayer, self).__init__() + self.use_ipu = use_ipu + self.conv = paddle.nn.Conv2D(in_channels=3, + out_channels=1, + kernel_size=2, + stride=1) + + @to_static() + def forward(self, x, target=None): + x = self.conv(x) + x = paddle.fluid.layers.flatten(x, axis=1) + if target is not None: + x = paddle.fluid.layers.softmax(x) + loss = paddle.fluid.layers.cross_entropy(x, target) + if self.use_ipu: + loss = paddle.incubate.identity_loss(loss, 1) + else: + loss = paddle.mean(loss) + return x, loss + return x + + +class TestBase(unittest.TestCase): + + @classmethod + def setUpClass(cls): + paddle.disable_static() + + def _test(self, use_ipu=False): + paddle.seed(SEED) + np.random.seed(SEED) + model = SimpleLayer(use_ipu) + optim = paddle.optimizer.Adam(learning_rate=0.01, + parameters=model.parameters()) + data = paddle.uniform((32, 3, 10, 10), dtype='float32') + label = paddle.randint(0, 10, shape=[32], dtype='int64') + + if use_ipu: + device = paddle.set_device('ipu') + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=1, + is_training=True, + micro_batch_size=1, + enable_manual_shard=False) + ipu_strategy.set_optimizer(optim) + + result = [] + for epoch in range(100): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(data, label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + + result.append(loss) + + if use_ipu: + ipu_strategy.release_patch() + + return np.array(result) + + def test_training(self): + ipu_loss = self._test(True).flatten() + cpu_loss = self._test(False).flatten() + + self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-4)) + + +class TestSaveLoad(TestBase): + + @classmethod + def setUpClass(cls): + paddle.disable_static() + cls.save_path = tempfile.TemporaryDirectory() + + @classmethod + def tearDownClass(cls): + cls.save_path.cleanup() + + def _test(self, use_ipu=False): + paddle.seed(SEED) + np.random.seed(SEED) + model = SimpleLayer(use_ipu) + optim = paddle.optimizer.Adam(learning_rate=0.01, + parameters=model.parameters()) + data = paddle.uniform((32, 3, 10, 10), dtype='float32') + label = paddle.randint(0, 10, shape=[32], dtype='int64') + model_path = '{}/model_state_dict_{}.pdparams'.format( + self.save_path, 'ipu' if use_ipu else 'cpu') + optim_path = '{}/optim_state_dict_{}.pdopt'.format( + self.save_path, 'ipu' if use_ipu else 'cpu') + + if use_ipu: + device = paddle.set_device('ipu') + ipu_strategy = paddle.static.IpuStrategy() + 
ipu_strategy.set_graph_config(num_ipus=1, + is_training=True, + micro_batch_size=1, + enable_manual_shard=False) + ipu_strategy.set_optimizer(optim) + + result = [] + for epoch in range(100): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(data, label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + + result.append(loss) + + if use_ipu: + paddle.fluid.core.IpuBackend.get_instance().weights_to_host() + + paddle.save(model.state_dict(), model_path) + paddle.save(optim.state_dict(), optim_path) + + model.set_state_dict(paddle.load(model_path)) + optim.set_state_dict(paddle.load(optim_path)) + + for epoch in range(100): + # ipu only needs call model() to do forward/backward/grad_update + pred, loss = model(data, label) + if not use_ipu: + loss.backward() + optim.step() + optim.clear_grad() + + result.append(loss) + + if use_ipu: + ipu_strategy.release_patch() + + return np.array(result) + + +class TestPatch(unittest.TestCase): + + @classmethod + def setUpClass(cls): + paddle.disable_static() + + def test(self, use_ipu=False): + old_getter = ProgramCache.__getitem__ + old_step = LRScheduler.step + + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.release_patch() + + reset_getter = ProgramCache.__getitem__ + reset_step = LRScheduler.step + + self.assertTrue(reset_getter is old_getter) + self.assertTrue(reset_step is old_step) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_identity_loss_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_identity_loss_ipu.py new file mode 100644 index 0000000000000..9a44a9e7c306f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_identity_loss_ipu.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.compiler as compiler +import paddle.optimizer +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, + np_dtype_to_fluid_str) +from paddle.utils.cpp_extension import load + +paddle.enable_static() + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_feed_attr() + self.set_op() + + def set_op(self): + # setup custom op + self.op = paddle.incubate.identity_loss + + def set_feed(self): + self.feed = { + "x": np.random.uniform(low=-2, high=2, size=[3, + 5]).astype('float32'), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [ + np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() + ] + + def _test_base(self, reduction): + scope = fluid.core.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + SEED = 0 + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + + with fluid.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + + out = self.op(x, reduction) + fetch_list = [out.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=1, is_training=False) + ipu_compiler = compiler.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy) + program = ipu_compiler.compile(feed_list, fetch_list) + + ipu_res = exe.run(program, self.feed, fetch_list) + + if reduction == 0: + # sum + cpu_res = self.feed['x'].sum() + elif reduction == 1: + # mean + cpu_res = self.feed['x'].mean() + else: + # none + cpu_res = self.feed['x'] + + self.assertTrue(np.allclose(ipu_res[0], cpu_res, atol=self.atol)) + + def test_base(self): + # TODO: use string instead of int for reduction + for reduction in [0, 1, 2]: + self._test_base(reduction) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_identity_loss_op.py b/python/paddle/fluid/tests/unittests/test_identity_loss_op.py new file mode 100644 index 0000000000000..3912fcafd52d0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_identity_loss_op.py @@ -0,0 +1,188 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +from op_test import OpTest +from paddle.fluid.framework import _test_eager_guard + + +class TestIdentityLossOp(OpTest): + + def setUp(self): + self.max_relative_error = 0.006 + self.python_api = paddle.incubate.identity_loss + + self.inputs = {} + self.initTestCase() + self.dtype = np.float64 + + self.op_type = "identity_loss" + self.attrs = {} + self.attrs['reduction'] = self.reduction + + input = np.random.random(self.shape).astype(self.dtype) + + self.inputs['X'] = input + if self.reduction == 0: + output = input.sum() + elif self.reduction == 1: + output = input.mean() + else: + output = input + self.outputs = {'Out': output} + + def test_check_output(self): + paddle.enable_static() + self.check_output(check_eager=True) + paddle.disable_static() + + def test_check_grad_normal(self): + paddle.enable_static() + self.check_grad(['X'], 'Out', check_eager=True) + paddle.disable_static() + + def initTestCase(self): + self.shape = (4, 10, 10) + self.reduction = 0 + + +class TestCase1(TestIdentityLossOp): + + def initTestCase(self): + self.shape = (8, 16, 8) + self.reduction = 0 + + +class TestCase2(TestIdentityLossOp): + + def initTestCase(self): + self.shape = (8, 16) + self.reduction = 1 + + +class TestCase3(TestIdentityLossOp): + + def initTestCase(self): + self.shape = (4, 8, 16) + self.reduction = 2 + + +class TestIdentityLossFloat32(TestIdentityLossOp): + + def set_attrs(self): + self.dtype = 'float32' + + +class TestIdentityLossOpError(unittest.TestCase): + + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + input_data = np.random.random((2, 4)).astype("float32") + + def test_int(): + paddle.incubate.identity_loss(x=input_data, reduction=3) + + self.assertRaises(Exception, test_int) + + def test_string(): + paddle.incubate.identity_loss(x=input_data, + reduction="wrongkey") + + self.assertRaises(Exception, test_string) + + def test_dtype(): + x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32') + paddle.incubate.identity_loss(x=x2, reduction=1) + + self.assertRaises(TypeError, test_dtype) + paddle.disable_static() + + +class TestIdentityLossAPI(unittest.TestCase): + + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + self.place = fluid.CPUPlace() + + def identity_loss_ref(self, input, reduction): + if reduction == 0 or reduction == "sum": + return input.sum() + elif reduction == 1 or reduction == "mean": + return input.mean() + else: + return input + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', self.x_shape) + out1 = paddle.incubate.identity_loss(x) + out2 = paddle.incubate.identity_loss(x, reduction=0) + out3 = paddle.incubate.identity_loss(x, reduction=1) + out4 = paddle.incubate.identity_loss(x, reduction=2) + + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x}, + fetch_list=[out1, out2, out3, out4]) + ref = [ + self.identity_loss_ref(self.x, 2), + self.identity_loss_ref(self.x, 0), + self.identity_loss_ref(self.x, 1), + self.identity_loss_ref(self.x, 2) + ] + for out, out_ref in zip(res, ref): + self.assertEqual(np.allclose(out, out_ref, rtol=1e-04), True) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def test_case(x, reduction): + x_tensor = 
paddle.to_tensor(x) + out = paddle.incubate.identity_loss(x_tensor, reduction) + out_ref = self.identity_loss_ref(x, reduction) + self.assertEqual(np.allclose(out.numpy(), out_ref, rtol=1e-04), + True) + + test_case(self.x, 0) + test_case(self.x, 1) + test_case(self.x, 2) + test_case(self.x, "sum") + test_case(self.x, "mean") + test_case(self.x, "none") + paddle.enable_static() + + def test_errors(self): + paddle.disable_static() + x = np.random.uniform(-1, 1, [10, 12]).astype('float32') + x = paddle.to_tensor(x) + self.assertRaises(Exception, paddle.incubate.identity_loss, x, -1) + self.assertRaises(Exception, paddle.incubate.identity_loss, x, 3) + self.assertRaises(Exception, paddle.incubate.identity_loss, x, + "wrongkey") + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', [10, 12], 'int32') + self.assertRaises(TypeError, paddle.incubate.identity_loss, x) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index bab1d92a83659..8a28b65f696b3 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -35,6 +35,8 @@ from . import nn #noqa: F401 from . import asp #noqa: F401 +from ..fluid.layers.loss import identity_loss + from ..fluid.incubate import fleet __all__ = [ @@ -50,4 +52,5 @@ 'segment_mean', 'segment_max', 'segment_min', + 'identity_loss', ] From 7e3833a7ab419ae2f89e2b70577ef8e07d5b00e3 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 7 Jul 2022 13:26:27 +0800 Subject: [PATCH 092/250] update license, test=document_fix (#44144) --- paddle/fluid/platform/device_context_test_cuda_graph.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/device_context_test_cuda_graph.cu b/paddle/fluid/platform/device_context_test_cuda_graph.cu index 9f5a551743ed1..efb0d9ed75689 100644 --- a/paddle/fluid/platform/device_context_test_cuda_graph.cu +++ b/paddle/fluid/platform/device_context_test_cuda_graph.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
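
The identity_loss tests above exercise the reduction semantics of the newly exported paddle.incubate.identity_loss API (0/"sum" reduces to a sum, 1/"mean" to a mean, 2/"none" passes the input through). The following is a minimal dygraph sketch of that behavior, based only on the tests shown above and not part of any patch in this series:

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.random.uniform(-1, 1, [3, 5]).astype('float32'))

    # reduction accepts 0/"sum", 1/"mean", 2/"none", mirroring identity_loss_ref above
    loss_sum = paddle.incubate.identity_loss(x, reduction=0)        # equals x.sum()
    loss_mean = paddle.incubate.identity_loss(x, reduction="mean")  # equals x.mean()
    loss_none = paddle.incubate.identity_loss(x, reduction=2)       # returns x unchanged

Passing any other reduction value (e.g. -1, 3, or an unknown string) raises an exception, as covered by TestIdentityLossOpError above.
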
From db2c71a41d499d49f2238f5b8c88fa2ad5d2be64 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Thu, 7 Jul 2022 13:31:23 +0800 Subject: [PATCH 093/250] [AutoParallel] fix 'op_role' for gradient merge & recompute (#44138) * fix op_role * fix engine * update op_role --- .../distributed/auto_parallel/engine.py | 205 ++++++++---------- .../dist_check_finite_and_unscale.py | 6 +- .../auto_parallel/operators/dist_embedding.py | 28 ++- .../auto_parallel/operators/dist_matmul.py | 54 +++-- .../distributed/auto_parallel/partitioner.py | 8 +- .../paddle/distributed/auto_parallel/utils.py | 15 +- .../distributed/passes/auto_parallel_amp.py | 4 +- .../passes/auto_parallel_gradient_merge.py | 46 ++-- 8 files changed, 182 insertions(+), 184 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 5e4a8c7d04033..3d5b91cd7faa7 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -18,7 +18,6 @@ import paddle import paddle.utils as utils -import paddle.distributed.auto_parallel as auto from paddle import fluid, static from paddle.io import Dataset @@ -72,7 +71,6 @@ def __init__(self, self._saver = DistributedSaver() self._logger = get_logger(logging.INFO) - self._default_strategy = None self._orig_main_prog = static.default_main_program() self._orig_startup_prog = static.default_startup_program() self._orig_dist_context = get_default_distributed_context() @@ -117,9 +115,11 @@ def prepare(self, self._planned_mode = None self._modes = ['train', 'eval', 'predict'] - self._build() - # Do auto parallel process + # Build program and do auto parallel process + for mode in self._modes: + # Build forward program + self._build(mode) for mode in self._modes: # Do the planning process self._plan(mode) @@ -129,56 +129,49 @@ def prepare(self, # Init comm and startup program self._initialize(mode) - def _build(self): - for mode in self._modes: - serial_main_prog = self._serial_main_progs.get(mode, None) - if serial_main_prog is not None: - return - - losses = [] - metrics = [] - serial_main_prog = self._orig_main_prog.clone() - serial_startup_prog = self._orig_startup_prog.clone() - with static.program_guard(serial_main_prog, serial_startup_prog), \ - utils.unique_name.guard(): - inputs_spec = self.inputs_spec - labels_spec = self.labels_spec if self.labels_spec else [] - inputs = [s._create_feed_layer() for s in inputs_spec] - labels = [s._create_feed_layer() for s in labels_spec] - outputs = to_list(self.model(*inputs)) - if mode != "predict" and self._loss: - losses = to_list(self._loss(*(outputs + labels))) - - if mode != "predict": - for metric in self._metrics: - metrics.extend( - to_list(metric.compute(*(outputs + labels)))) - - default_ctx = get_default_distributed_context() - if not default_ctx.has_annotation or self._default_strategy: - # We build the world process group because the data parallel - # needs all ranks by default. 
- new_process_group(list(range(self._nranks))) - default_ctx.data_parallel = True - - # self._feed_vars[mode] = {"inputs": inputs, "labels": labels} - feed_vars = {"inputs": inputs, "labels": labels} - - # self._fetch_vars[mode] = { - # "outputs": flatten(outputs), - # "loss": losses, - # "metrics": metrics - # } - fetch_vars = { - "outputs": flatten(outputs), - "loss": losses, - "metrics": metrics - } - - self._dist_contexts[mode] = DistributedContext( - serial_main_prog, serial_startup_prog, self._optimizer, losses, - feed_vars, fetch_vars, self.cluster, self.strategy) - self._dist_contexts[mode].gradient_scale = self._gradient_scale + def _build(self, mode): + + serial_main_prog = self._serial_main_progs.get(mode, None) + if serial_main_prog is not None: + return + + losses = [] + metrics = [] + serial_main_prog = self._orig_main_prog.clone() + serial_startup_prog = self._orig_startup_prog.clone() + with static.program_guard(serial_main_prog, serial_startup_prog), \ + utils.unique_name.guard(): + inputs_spec = self.inputs_spec + labels_spec = self.labels_spec if self.labels_spec else [] + inputs = [s._create_feed_layer() for s in inputs_spec] + labels = [s._create_feed_layer() for s in labels_spec] + outputs = to_list(self.model(*inputs)) + if mode != "predict" and self._loss: + losses = to_list(self._loss(*(outputs + labels))) + + if mode != "predict": + for metric in self._metrics: + metrics.extend(to_list(metric.compute(*(outputs + labels)))) + + default_ctx = get_default_distributed_context() + if not default_ctx.has_annotation: + # We build the world process group because the data parallel + # needs all ranks by default. + new_process_group(list(range(self._nranks))) + default_ctx.data_parallel = True + + feed_vars = {"inputs": inputs, "labels": labels} + + fetch_vars = { + "outputs": flatten(outputs), + "loss": losses, + "metrics": metrics + } + + self._dist_contexts[mode] = DistributedContext( + serial_main_prog, serial_startup_prog, self._optimizer, losses, + feed_vars, fetch_vars, self.cluster, self.strategy) + self._dist_contexts[mode].gradient_scale = self._gradient_scale def _plan(self, mode): if self._planned_mode is None: @@ -240,7 +233,6 @@ def _initialize(self, mode): continue process_group.instantiate() - # initialize self._place = _get_device() if isinstance(self._place, fluid.CUDAPlace): self._place = fluid.CUDAPlace(ParallelEnv().dev_id) @@ -273,8 +265,8 @@ def fit(self, train_dataloader = self._create_dataloader(train_data, batch_size, epochs, steps_per_epoch) - usr_fetch = self._to_map_fetch(fetches) - fetch_loss = self._inner_fetch(self.fetch_vars["loss"]) + usr_fetch = self._validate_fetches(fetches) + fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) fetch_list, fetch_map = self._fetch_map(fetch_loss, usr_fetch) for epoch in range(epochs): @@ -292,8 +284,7 @@ def fit(self, user_outs = outs[len(fetch_loss):] user_fetch_list = fetch_list[len(fetch_loss):] for i, out in enumerate(user_outs): - train_logs["train_" + - fetch_map[user_fetch_list[i]]] = out[0] + train_logs["train_" + fetch_map[user_fetch_list[i]]] = out self._logger.info(train_logs) def evaluate(self, @@ -307,9 +298,9 @@ def evaluate(self, "eval model is not ready, please call `engine.prepare()` first." 
eval_dataloader = self._create_dataloader(eval_data, batch_size) - usr_fetch = self._to_map_fetch(fetches) - fetch_loss = self._inner_fetch(self.fetch_vars["loss"]) - fetch_metrics = self._inner_fetch(self.fetch_vars["metrics"]) + usr_fetch = self._validate_fetches(fetches) + fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) + fetch_metrics = self._validate_fetches(self.fetch_vars["metrics"]) inner_fetch = dict(fetch_loss, **fetch_metrics) fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch) @@ -321,7 +312,7 @@ def evaluate(self, return_numpy=return_numpy) # inner fetches if fetch_loss: - eval_logs["eval_loss"] = outs[0] + eval_logs["eval_loss"] = outs[0][0] # Metric if fetch_metrics: metric_out = outs[len(fetch_loss):len(inner_fetch)] @@ -331,9 +322,9 @@ def evaluate(self, for i, res in enumerate(to_list(results)): eval_logs["eval_" + metric.name()[i]] = res # usr fetches - usr_out = outs[len(inner_fetch):] + usr_outs = outs[len(inner_fetch):] usr_fetch_list = fetch_list[len(inner_fetch):] - for i, out in enumerate(usr_out): + for i, out in enumerate(usr_outs): eval_logs["eval_" + fetch_map[usr_fetch_list[i]]] = out # logger self._logger.info(eval_logs) @@ -349,8 +340,8 @@ def predict(self, "predict model is not ready, please call `engine.prepare()` first." test_dataloader = self._create_dataloader(test_data, batch_size) - usr_fetch = self._to_map_fetch(fetches) - fetch_outputs = self._inner_fetch(self.fetch_vars["outputs"]) + usr_fetch = self._validate_fetches(fetches) + fetch_outputs = self._validate_fetches(self.fetch_vars["outputs"]) fetch_list, fetch_map = self._fetch_map(fetch_outputs, usr_fetch) outputs = [] @@ -362,42 +353,11 @@ def predict(self, return_numpy=return_numpy) outputs.append(outs[:len(fetch_outputs)]) for i, out in enumerate(outs): - predict_logs["pred_" + fetch_map[fetch_list[i]]] = out[0] + predict_logs["pred_" + fetch_map[fetch_list[i]]] = out self._logger.info(predict_logs) return outputs - def _local_var(self, var): - var_name = _to_name_str(var) - return var_name in self.main_program.global_block().vars - - def _to_map_fetch(self, fetches): - if not fetches: - return {} - if isinstance(fetches, dict): - fetch_var_names = list(map(_to_name_str, fetches.values())) - usr_fetches = dict(zip(fetch_var_names, list(fetches.keys()))) - elif isinstance(fetches, list): - fetch_var_names = list(map(_to_name_str, fetches)) - usr_fetches = dict(zip(fetch_var_names, fetch_var_names)) - return dict(filter(lambda x: self._local_var(x[0]), - usr_fetches.items())) - - def _inner_fetch(self, fetch_vars): - fetch_list = list( - map(lambda x: x.name, list(filter(self._local_var, fetch_vars)))) - inner_fetches = dict(zip(fetch_list, fetch_list)) - return inner_fetches - - def _fetch_map(self, inner_fetch, usr_fetch): - # replace inner fetch name if usr set for it - for iname in inner_fetch: - if iname in usr_fetch: - inner_fetch[iname] = usr_fetch[iname] - usr_fetch.pop(iname) - fetches = dict(inner_fetch, **usr_fetch) - return list(fetches.keys()), fetches - def _create_dataloader(self, dataset, batch_size, @@ -468,26 +428,35 @@ def _validate_spec(self, specs): .format(i, spec)) return specs - def _set_data_parallel(self, var): - if self._nranks == 1: - self._default_strategy = 'serial' - auto.shard_tensor(var, - dist_attr={ - "process_mesh": [0], - "dims_mapping": - [-1 for _ in range(len(var.shape))] - }) + def _is_local_var(self, var): + var_name = _to_name_str(var) + return var_name in self.main_program.global_block().vars + + def _validate_fetches(self, 
fetches): + # 1. Check user-defined fetches type + # 2. Prepare fetches_dict like {user_defined_name: var_name} + if not fetches: + return {} + if isinstance(fetches, dict): + fetch_var_names = list(map(_to_name_str, fetches.values())) + fetches_dict = dict(zip(fetch_var_names, list(fetches.keys()))) + elif isinstance(fetches, list): + fetch_var_names = list(map(_to_name_str, fetches)) + fetches_dict = dict(zip(fetch_var_names, fetch_var_names)) else: - self._default_strategy = 'dp' - auto.shard_tensor(var, - dist_attr={ - "process_mesh": - list(range(self._nranks)), - "dims_mapping": - [0] + [-1 for _ in range(len(var.shape) - 1)] - }) - - return var + raise TypeError("'fetches' only support 'dict' and 'list', " + "but got '{}'".format(str(type(fetches)))) + return dict( + filter(lambda x: self._is_local_var(x[0]), fetches_dict.items())) + + def _fetch_map(self, inner_fetch, usr_fetch): + # replace inner fetch name if usr set for it + for iname in inner_fetch: + if iname in usr_fetch: + inner_fetch[iname] = usr_fetch[iname] + usr_fetch.pop(iname) + fetches = dict(inner_fetch, **usr_fetch) + return list(fetches.keys()), fetches def _get_data_parallel_info(self, var, dist_context): # get data parallel world size and current data parallel rank diff --git a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py index 0a4bfb1213d46..b00f1a589e312 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py @@ -137,7 +137,7 @@ def backward(ctx, *args, **kwargs): attrs={ "in_dtype": inf_var.dtype, "out_dtype": inf_var_int32.dtype, - OP_ROLE_KEY: OpRole.Backward + OP_ROLE_KEY: OpRole.Optimize }) allreduce_op = main_block.append_op(type='c_allreduce_max', inputs={'X': inf_var_int32}, @@ -145,7 +145,7 @@ def backward(ctx, *args, **kwargs): attrs={ 'ring_id': group.id, 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Backward + OP_ROLE_KEY: OpRole.Optimize }) cast_op2 = main_block.append_op(type='cast', inputs={'X': inf_var_int32}, @@ -153,7 +153,7 @@ def backward(ctx, *args, **kwargs): attrs={ "in_dtype": inf_var_int32.dtype, "out_dtype": inf_var.dtype, - OP_ROLE_KEY: OpRole.Backward + OP_ROLE_KEY: OpRole.Optimize }) main_block._sync_with_cpp() diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 2272400e60ddf..80c9b8641ba36 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -222,7 +222,10 @@ def forward(ctx, *args, **kwargs): 'W': [Weight_var] }, outputs={'Out': [intermediate_var_0]}, - attrs={"start_index": relative_idx}) + attrs={ + "start_index": relative_idx, + OP_ROLE_KEY: src_op.attr('op_role') + }) if intermediate_var_0.shape != ref_shape: intermediate_var_0.desc.set_shape(ref_shape) @@ -235,6 +238,7 @@ def forward(ctx, *args, **kwargs): 'ring_id': group.id, 'use_calc_stream': True, 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if Out_var.shape != ref_shape: Out_var.desc.set_shape(ref_shape) @@ -442,6 +446,7 @@ def backward(ctx, *args, **kwargs): dp_group = new_process_group(group_ranks) if need_gradient_allreduce: + added_ops = [] W_Grad_var = main_block.var(kwargs['W@GRAD'][0]) allreduce_op = 
main_block.append_op(type='c_allreduce_sum', inputs={'X': [W_Grad_var]}, @@ -451,19 +456,24 @@ def backward(ctx, *args, **kwargs): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Backward }) - scale_op = main_block.append_op(type='scale', - inputs={'X': W_Grad_var}, - outputs={'Out': W_Grad_var}, - attrs={ - 'scale': 1.0 / dp_degree, - OP_ROLE_KEY: OpRole.Backward - }) + added_ops.append(allreduce_op) + + if ctx.gradient_scale: + scale_op = main_block.append_op(type='scale', + inputs={'X': W_Grad_var}, + outputs={'Out': W_Grad_var}, + attrs={ + 'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward + }) + added_ops.append(scale_op) + main_block._sync_with_cpp() dims_mapping = ctx.get_tensor_dist_attr_for_program( W_Grad_var).dims_mapping process_mesh = dist_attr.process_mesh - for op in [allreduce_op, scale_op]: + for op in added_ops: op_attr = OperatorDistributedAttribute() op_attr.process_mesh = process_mesh op_attr.set_output_dims_mapping(W_Grad_var.name, dims_mapping) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 427932a77fbcd..0826148208ec0 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -405,6 +405,7 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): dp_group = new_process_group(group_ranks) if need_gradient_allreduce and is_parameter_related(Y_var.name, main_block): + added_ops = [] Y_Grad_var = main_block.var(kwargs['Y@GRAD'][0]) allreduce_op = main_block.append_op(type='c_allreduce_sum', inputs={'X': [Y_Grad_var]}, @@ -414,19 +415,24 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Backward }) - scale_op = main_block.append_op(type='scale', - inputs={'X': Y_Grad_var}, - outputs={'Out': Y_Grad_var}, - attrs={ - 'scale': 1.0 / dp_degree, - OP_ROLE_KEY: OpRole.Backward - }) + added_ops.append(allreduce_op) + + if ctx.gradient_scale: + scale_op = main_block.append_op(type='scale', + inputs={'X': Y_Grad_var}, + outputs={'Out': Y_Grad_var}, + attrs={ + 'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward + }) + added_ops.append(scale_op) + main_block._sync_with_cpp() dims_mapping = ctx.get_tensor_dist_attr_for_program( Y_Grad_var).dims_mapping process_mesh = dist_attr.process_mesh - for op in [allreduce_op, scale_op]: + for op in added_ops: op_attr = OperatorDistributedAttribute() op_attr.process_mesh = process_mesh op_attr.set_output_dims_mapping(Y_Grad_var.name, dims_mapping) @@ -617,6 +623,7 @@ def forward(ctx, *args, **kwargs): 'ring_id': group.id, 'use_calc_stream': True, 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if intermediate_var_0.shape != ref_shape_x: intermediate_var_0.desc.set_shape(ref_shape_x) @@ -629,6 +636,7 @@ def forward(ctx, *args, **kwargs): 'transpose_X': False, 'transpose_Y': False, 'alpha': 1, + OP_ROLE_KEY: src_op('op_role') } inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} matmul_op = main_block.append_op(type='matmul', @@ -814,6 +822,7 @@ def forward(ctx, *args, **kwargs): 'transpose_X': False, 'transpose_Y': False, 'alpha': 1, + OP_ROLE_KEY: src_op.attr('op_role') } inputs = {'X': X_var, 'Y': Weight_var} @@ -853,7 +862,8 @@ def forward(ctx, *args, **kwargs): attrs={ 'ring_id': group.id, 'use_calc_stream': True, - 'use_model_parallel': True + 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if Out_var.shape != 
ref_shape: Out_var.desc.set_shape(ref_shape) @@ -1137,6 +1147,7 @@ def forward(ctx, *args, **kwargs): 'ring_id': group.id, 'use_calc_stream': True, 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role'), }) if intermediate_var_0.shape != ref_shape_x: intermediate_var_0.desc.set_shape(ref_shape_x) @@ -1145,7 +1156,11 @@ def forward(ctx, *args, **kwargs): ['float16', 'float32', 'float64'], 'linear') check_dtype(intermediate_var_0.dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') - attrs = {'trans_x': False, 'trans_y': False} + attrs = { + 'trans_x': False, + 'trans_y': False, + OP_ROLE_KEY: src_op.attr('op_role') + } inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} matmul_v2_op = main_block.append_op(type='matmul_v2', inputs=inputs, @@ -1322,7 +1337,11 @@ def forward(ctx, *args, **kwargs): 'linear') check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') - attrs = {'trans_x': False, 'trans_y': False} + attrs = { + 'trans_x': False, + 'trans_y': False, + OP_ROLE_KEY: src_op.attr('op_role') + } inputs = {'X': X_var, 'Y': Weight_var} # infer out var shape with op dist attr @@ -1361,7 +1380,8 @@ def forward(ctx, *args, **kwargs): attrs={ 'ring_id': group.id, 'use_calc_stream': True, - 'use_model_parallel': True + 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if Out_var.shape != ref_shape: Out_var.desc.set_shape(ref_shape) @@ -1646,6 +1666,7 @@ def forward(ctx, *args, **kwargs): 'ring_id': group.id, 'use_calc_stream': True, 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if intermediate_var_0.shape != ref_shape_x: intermediate_var_0.desc.set_shape(ref_shape_x) @@ -1657,7 +1678,8 @@ def forward(ctx, *args, **kwargs): # attrs = {'trans_x': False, 'trans_y': False} attrs = { "x_num_col_dims": src_op.desc.attr("x_num_col_dims"), - "y_num_col_dims": src_op.desc.attr("y_num_col_dims") + "y_num_col_dims": src_op.desc.attr("y_num_col_dims"), + OP_ROLE_KEY: src_op.attr('op_role') } inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]} mul_op = main_block.append_op(type='mul', @@ -1838,7 +1860,8 @@ def forward(ctx, *args, **kwargs): # attrs = {'trans_x': False, 'trans_y': False} attrs = { "x_num_col_dims": src_op.desc.attr("x_num_col_dims"), - "y_num_col_dims": src_op.desc.attr("y_num_col_dims") + "y_num_col_dims": src_op.desc.attr("y_num_col_dims"), + OP_ROLE_KEY: src_op.attr('op_role') } inputs = {'X': X_var, 'Y': Weight_var} @@ -1878,7 +1901,8 @@ def forward(ctx, *args, **kwargs): attrs={ 'ring_id': group.id, 'use_calc_stream': True, - 'use_model_parallel': True + 'use_model_parallel': True, + OP_ROLE_KEY: src_op.attr('op_role') }) if Out_var.shape != ref_shape: Out_var.desc.set_shape(ref_shape) diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 9056ab34fa711..97ff881ef95bf 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -264,10 +264,12 @@ def partition_block(self, ref_block, target_block): self._dist_context, **kinputs, **koutputs, **{"grad_var_to_var": grad_var_to_var}) elif is_optimize_op(op): + # NOTE: BACKWARD_ONLY_DIST_OPS's op_role must 2 because of 1F1B PASS kinputs, koutputs = dist_op_context.prepare_context(op) - dist_op_impl = get_distributed_operator_impl_container( - "default").get_impl(0) - dist_op_impl.backward(self._dist_context, **kinputs, **koutputs) + dist_op_opt_impl = _get_dist_op_backward_implement( + op, self._dist_context, 
forward_op_id2forward_op) + dist_op_opt_impl.backward(self._dist_context, **kinputs, + **koutputs) else: raise NotImplementedError( "partitioner only support forward and backward, optimize ops, but got {}" diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index e220b654e700a..c4f9ad8b6bc84 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -1065,7 +1065,7 @@ def set_grad_var_shape(program, dist_context): "softmax", "cross_entropy2", "dropout", "tanh", ["slice_grad", "c_allgather"], "assign", "matmul_v2_grad_grad", "elementwise_add_grad_grad", "shape", "sqrt", - "fused_softmax_mask_upper_triangle_grad" + "fused_softmax_mask_upper_triangle" ] if op.type in need_set_shape_list: for forward_op in block.ops: @@ -1096,11 +1096,9 @@ def set_grad_var_shape(program, dist_context): def is_forward_op(op): - ref_role1 = int(core.op_proto_and_checker_maker.OpRole.Forward) - ref_role2 = int(core.op_proto_and_checker_maker.OpRole.Loss) op_role = int(op.attr('op_role')) - return OP_ROLE_KEY in op.attr_names and (op_role == ref_role1 - or op_role == ref_role2) + return OP_ROLE_KEY in op.attr_names and (op_role == int(OpRole.Forward) + or op_role == int(OpRole.Loss)) def is_backward_op(op): @@ -1113,9 +1111,14 @@ def is_optimize_op(op): int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize) +def is_lr_sched_op(op): + return OP_ROLE_KEY in op.attr_names and \ + int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize.LRSched) + + def is_loss_op(op): return OP_ROLE_KEY in op.attr_names and \ - int(op.all_attrs()[OP_ROLE_KEY]) == (int(core.op_proto_and_checker_maker.OpRole.Forward) | int(core.op_proto_and_checker_maker.OpRole.Loss)) + int(op.all_attrs()[OP_ROLE_KEY]) == (int(OpRole.Forward) | int(OpRole.Loss)) def is_prim_op(op): diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index f0d02451141ae..7afba8c0f1377 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -452,7 +452,7 @@ def _check_and_update_gradient(params_grads, loss_scaling, dist_context): inputs = {'X': grads, 'Scale': loss_scaling} outputs = {'Out': grads, 'FoundInfinite': found_inf} - attrs = {'op_role': OpRole.Backward} + attrs = {'op_role': OpRole.Optimize} new_op = main_block.append_op(type='check_finite_and_unscale', inputs=inputs, outputs=outputs, @@ -732,7 +732,7 @@ def _update_loss_scaling(self, grads, found_inf): 'incr_ratio': self.get_attr("incr_ratio"), 'decr_ratio': self.get_attr("decr_ratio"), 'stop_update': self.get_attr("stop_update"), - 'op_role': OpRole.Backward + 'op_role': OpRole.Optimize } new_op = main_block.append_op(type='update_loss_scaling', diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index 394d71706c4c4..66cce97533efc 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -21,20 +21,13 @@ from paddle.fluid import layers from paddle.fluid.framework import program_guard, device_guard from .pass_base import PassBase, PassType, register_pass -from paddle.distributed.fleet.meta_optimizers.common import OpRole -from paddle.distributed.auto_parallel.utils import set_var_dist_attr +from paddle.distributed.auto_parallel.utils import set_var_dist_attr, 
is_optimize_op, OpRole, OP_ROLE_KEY from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping from paddle.distributed.auto_parallel.process_group import get_world_process_group world_process_group = get_world_process_group() -def _is_the_optimizer_op(op): - OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() - return OP_ROLE_KEY in op.attr_names and \ - int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize) - - def _remove_and_get_optimizer_op(main_program, dist_context): # 1 create tmp block # 2 mv optimizer op from global program to tmp block @@ -43,9 +36,8 @@ def _remove_and_get_optimizer_op(main_program, dist_context): temp_block = main_program._create_block() removed_op_idx = [] optimize_ops_desc = [] - skip_ops = ["increment", "elementwise_mod", "equal"] for idx, op in enumerate(main_block.ops): - if _is_the_optimizer_op(op) and op.type not in skip_ops: + if is_optimize_op(op): # append optimizer op to tmp block new_op_desc = temp_block.desc.append_op() new_op_desc.copy_from(op.desc) @@ -57,7 +49,8 @@ def _remove_and_get_optimizer_op(main_program, dist_context): dist_context.del_dist_op_for_program(op) for idx in removed_op_idx[::-1]: - main_block._remove_op(idx) + main_block._remove_op(idx, sync=False) + main_block._sync_with_cpp() return optimize_ops_desc @@ -109,7 +102,7 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): outputs={'Out': [step_var]}, attrs={ 'step': float(1.0), - 'op_role': OpRole.Optimize + OP_ROLE_KEY: OpRole.Backward }) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( increment_op, world_process_group.ranks, [-1], dist_context) @@ -123,7 +116,8 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): attrs={ 'axis': -1, 'use_mkldnn': False, - 'op_role': OpRole.Optimize + OP_ROLE_KEY: + OpRole.Backward }) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( elementwise_mod_op, world_process_group.ranks, [-1], dist_context) @@ -134,7 +128,7 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): 'Y': zero_var }, outputs={'Out': cond_var}, - attrs={'op_role': OpRole.Optimize}) + attrs={OP_ROLE_KEY: OpRole.Backward}) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( equal_op, world_process_group.ranks, [-1], dist_context) @@ -143,7 +137,6 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): def _append_gradient_merge_backward_op( main_program, startup_program, params_grads: List[Tuple[Any, Any]], - cond_var_name: str, dist_context) -> Tuple[List[Tuple[Any, Any]], Dict[str, Any]]: main_block = main_program.global_block() startup_block = startup_program.global_block() @@ -201,7 +194,7 @@ def _append_gradient_merge_backward_op( attrs={ 'axis': -1, 'use_mkldnn': False, - 'op_role': OpRole.Optimize + OP_ROLE_KEY: OpRole.Backward }) new_params_to_grads.append([param, gradient_merge_var]) grad_to_gradient_merge[grad.name] = gradient_merge_var.name @@ -233,8 +226,7 @@ def true_apply_gradient(): 'bias': 0.0, 'bias_after_scale': False }) - new_grad.op._set_attr(op_maker.kOpRoleAttrName(), - OpRole.Optimize) + new_grad.op._set_attr(OP_ROLE_KEY, OpRole.Optimize) # append optimizer ops for op_desc in optimize_ops_desc: @@ -272,29 +264,27 @@ def true_apply_gradient(): dtype=new_grad.dtype, value=0.0, out=new_grad) - new_grad.op._set_attr(op_maker.kOpRoleAttrName(), - op_maker.OpRole.Optimize) + new_grad.op._set_attr(OP_ROLE_KEY, op_maker.OpRole.Optimize) layers.cond(cond_var, true_fn=true_apply_gradient, false_fn=None) cond_op = main_program.global_block().ops[-1] - 
cond_op._set_attr('op_role', OpRole.Optimize) + cond_op._set_attr(OP_ROLE_KEY, OpRole.Optimize) def parse_program(main_program, startup_program, params_grads, k_steps, avg, dist_context): - # 1 create gradient_merge_cond - cond_var = _get_gm_cond_var(main_program, k_steps, dist_context) - - # 2 remove optimizer_op from main_program + # 1 remove optimizer_op from main_program optimize_ops_desc = _remove_and_get_optimizer_op(main_program, dist_context) # back to block 0 main_program._rollback() - # 3 append gradient merge backward op to main_program + # 2 append gradient merge backward op to main_program new_params_to_grads, grad_to_gradient_merge = _append_gradient_merge_backward_op( - main_program, startup_program, params_grads, cond_var.name, - dist_context) + main_program, startup_program, params_grads, dist_context) + + # 3 create gradient_merge_cond + cond_var = _get_gm_cond_var(main_program, k_steps, dist_context) # 4 create ConditionalBlock and append gradient merge optimizer ops _create_cond_block_and_update_optimizer(main_program, cond_var, From e35f0628e6bb854bb75228b5a0e09df4a6bb61de Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 7 Jul 2022 16:39:55 +0800 Subject: [PATCH 094/250] recover coverage build size check (#44153) --- paddle/scripts/paddle_build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 8577b8eb37efa..4e6e5eeee4770 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3454,6 +3454,7 @@ function main() { check_diff_file_for_coverage cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} enable_unused_var_check + check_coverage_build ;; gpu_cicheck_coverage) parallel_test From 9428c969d480cce41aea9b946811621d55a40ec6 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 7 Jul 2022 17:01:52 +0800 Subject: [PATCH 095/250] Revert "make inference_c test linking only paddle_inference_c (#44126)" (#44149) This reverts commit bbe995556130f7cd77241df999b2eb0cebd4a146. 
--- paddle/fluid/inference/capi/CMakeLists.txt | 4 -- .../fluid/inference/capi_exp/CMakeLists.txt | 4 -- .../fluid/inference/tests/api/CMakeLists.txt | 60 ++++++++++++++++--- 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/inference/capi/CMakeLists.txt b/paddle/fluid/inference/capi/CMakeLists.txt index 73ba41607aae8..25d8a39dc6374 100644 --- a/paddle/fluid/inference/capi/CMakeLists.txt +++ b/paddle/fluid/inference/capi/CMakeLists.txt @@ -20,10 +20,6 @@ cc_library( SRCS ${C_API_SRCS} DEPS paddle_inference) -if(NOT ON_INFER) - return() -endif() - # Create inference capi shared library cc_library( paddle_inference_c_shared SHARED diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt index e35e14a0c0241..56de57cbb9c85 100644 --- a/paddle/fluid/inference/capi_exp/CMakeLists.txt +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -20,10 +20,6 @@ cc_library( SRCS ${C_API_SRCS} DEPS paddle_inference) -if(NOT ON_INFER) - return() -endif() - # Create inference capi shared library cc_library( paddle_inference_c_shared SHARED diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 1ed41417355ce..610883ad1ad27 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -943,17 +943,28 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS - paddle_inference_c + ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_gpu paddle_inference_c) + endif() inference_analysis_test( test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS - paddle_inference_c + ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_xpu paddle_inference_c) + endif() + set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") if(NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) @@ -1099,27 +1110,44 @@ inference_analysis_test( SRCS analyzer_capi_exp_tester.cc EXTRA_DEPS - paddle_inference_c + ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp paddle_inference_c) +endif() inference_analysis_test( test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc EXTRA_DEPS - paddle_inference_c + ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp_pd_config + paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp_pd_config paddle_inference_c) +endif() inference_analysis_test( test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc EXTRA_DEPS - paddle_inference_c + ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp_pd_tensor + paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp_pd_tensor paddle_inference_c) +endif() if(NOT APPLE AND NOT WIN32) inference_analysis_test( @@ -1127,9 
+1155,15 @@ if(NOT APPLE AND NOT WIN32) SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS - paddle_inference_c + ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_pd_threads + paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_pd_threads paddle_inference_c) + endif() endif() inference_analysis_test( @@ -1171,9 +1205,14 @@ if(WITH_MKLDNN) SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS - paddle_inference_c + ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) + if(WIN32) + target_link_libraries(test_analyzer_capi_exp_int paddle_inference_c_shared) + else() + target_link_libraries(test_analyzer_capi_exp_int paddle_inference_c) + endif() endif() inference_analysis_test( @@ -1181,9 +1220,14 @@ inference_analysis_test( SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS - paddle_inference_c + ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) +if(WIN32) + target_link_libraries(test_analyzer_capi_exp_ner paddle_inference_c_shared) +else() + target_link_libraries(test_analyzer_capi_exp_ner paddle_inference_c) +endif() if(WITH_GPU) inference_analysis_test( From 33540e109a742c42b8273e389a1ca1c89596869d Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Thu, 7 Jul 2022 17:38:04 +0800 Subject: [PATCH 096/250] Fix nan in fast_ln_fwd_kernel when cols > 1024 (#44125) * Fix nan in fast_ln_fwd_kernel when cols > 1024 * delete blas --- .../operators/fused/fused_layernorm_residual_dropout_bias.h | 4 ++-- paddle/fluid/operators/layer_norm_kernel.cu.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 4aedf4eb79bd1..301b62524a54d 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -573,7 +573,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = mu_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { mu_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { @@ -608,7 +608,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = var_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { var_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index e37f048235e7c..8ed706a5443af 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -252,7 +252,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = mu_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { mu_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { @@ -289,7 +289,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fast_ln_fwd_kernel( smem[warp_m * WARPS_N + warp_n] = var_local; } __syncthreads(); - if (tidx == 0) { + if (tidx % THREADS_PER_ROW == 0) { var_local = 0.f; #pragma unroll for (int it = 0; it < WARPS_N; ++it) { From d752a7f2fcbacb864da31da8d157fbff886eb79d Mon Sep 17 00:00:00 2001 From: taixiurong Date: Thu, 
7 Jul 2022 19:10:01 +0800 Subject: [PATCH 097/250] =?UTF-8?q?xpu-paddlepaddle-31=20=E4=BC=98?= =?UTF-8?q?=E5=8C=96matmul=20test=3Dkunlun=20(#43975)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/operators/matmul_op_xpu.cc | 411 +++---------- paddle/fluid/operators/matmul_v2_op_xpu.cc | 323 ++-------- paddle/fluid/operators/mul_op_xpu.cc | 188 ++---- paddle/fluid/operators/xpu_api_wrapper.h | 558 +++++++++++++++++- .../fluid/platform/device/xpu/xpu2_op_list.h | 15 +- .../unittests/xpu/get_test_cover_info.py | 4 +- .../tests/unittests/xpu/test_matmul_op_xpu.py | 3 +- 7 files changed, 735 insertions(+), 767 deletions(-) diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc index 945546a502aeb..efad516cdbfe5 100644 --- a/paddle/fluid/operators/matmul_op_xpu.cc +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -20,276 +20,40 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/xpu_api_wrapper.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { using framework::Tensor; -static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) { - if (x_dim.size() > 1) { - return x_dim; - } - return phi::make_ddim({1, x_dim[0]}); -} - -static framework::Tensor FoldInitDims(const framework::Tensor &input) { - auto output = input; - auto in_dims = input.dims(); - if (in_dims.size() == 3) { - output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); - } - return output; -} -/** - * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the - * original y_dim is returned. - */ -static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) { - if (y_dim.size() > 1) { - return y_dim; - } - return phi::make_ddim({y_dim[0], 1}); -} - -static void ReshapeTensorIntoMatrixSequence( - framework::Tensor *x, const phi::funcs::MatDescriptor &descriptor) { - int64_t h, w; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - if (descriptor.batch_size_) { - x->Resize({descriptor.batch_size_, h, w}); - } else { - x->Resize({h, w}); - } -} -/** - * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor - * Out = matmul(x, y) - * - * This method will first calculate X,Y matrix sequence, and then calculate - * the out shape. - * - * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] - * The out = [BatchSize, H1, W2] - * - * If there is no batch size in `X` and `Y`, the out will be [H1, W2] - * If any of `X` and `Y` has batch size BatchSize, the out will have the - * BatchSize. 
- */ -static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, - framework::Tensor *y, - framework::Tensor *out, - bool trans_x, - bool trans_y) { - auto x_dim = RowMatrixFromVector(x->dims()); - auto y_dim = ColumnMatrixFromVector(y->dims()); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); - if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { - out->Resize({mat_dim_x.height_, mat_dim_y.width_}); - } else { - out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, - mat_dim_y.width_}); - } - - ReshapeTensorIntoMatrixSequence(x, mat_dim_x); - ReshapeTensorIntoMatrixSequence(y, mat_dim_y); -} - -template -static void MatMulXPUFunction(const Tensor *x, - const Tensor *y, - Tensor *out, - bool trans_x, - bool trans_y, - const paddle::framework::ExecutionContext &ctx) { - using XPUType = typename XPUTypeTrait::Type; - const auto &x_dims = x->dims(); - const auto &y_dims = y->dims(); - auto &dev_ctx = - ctx.template device_context(); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x_dims), 0, trans_x); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( - ColumnMatrixFromVector(y_dims), 0, trans_y); - - if (x_dims.size() == 3 && y_dims.size() <= 2) { - // if transpose_X is true, the transpose cost much time - if (!trans_x) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } else { - mat_dim_b.batch_size_ = mat_dim_a.batch_size_; - mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; - } - } - - if (mat_dim_a.width_ == mat_dim_b.height_) { - if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) { - mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; - } - if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) { - mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; - } - } - - PADDLE_ENFORCE_EQ(mat_dim_a.width_, - mat_dim_b.height_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_op, the " - "first tensor width must be same as " - "second tensor height, but received " - "width:%d, height:%d x_dims = %s , y_dims = %s", - mat_dim_a.width_, - mat_dim_b.height_, - x_dims.to_str().c_str(), - y_dims.to_str().c_str())); - PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, - mat_dim_b.batch_size_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_op, the two input" - "tensor batch_size must be same, but received first " - "tensor batch_size:%d, second " - "tensor batch_size:%d, x_dims = %s , y_dims = %s", - mat_dim_a.batch_size_, - mat_dim_b.batch_size_, - x_dims.to_str().c_str(), - y_dims.to_str().c_str())); - - float alpha = static_cast(ctx.Attr("alpha")); - T *data_c = out->data(); - int m = mat_dim_a.height_; - int n = mat_dim_b.width_; - int k = mat_dim_a.width_; - int batch_size = mat_dim_a.batch_size_; - int ldx = mat_dim_a.trans_ ? m : k; - int ldy = mat_dim_b.trans_ ? 
k : n; - int ldout = n; - if (batch_size <= 1) { - int r = 0; - r = xpu_fc_wrapper( - dev_ctx.x_context(), - reinterpret_cast(x->data()), - reinterpret_cast(y->data()), - reinterpret_cast(data_c), - m, - n, - k, - mat_dim_a.trans_, - mat_dim_b.trans_, - nullptr, - nullptr, - nullptr, - ldx, - ldy, - ldout, - alpha, - 0, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU fc kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); - } else { - // batch matmul - int r = xpu::fc_batched( - dev_ctx.x_context(), // Context* ctx, - batch_size, // int batch_size, - mat_dim_a.trans_, // bool x_trans, - mat_dim_b.trans_, // bool w_trans, - m, // int m, - n, // int n, - k, // int k, - alpha, // float alpha, - reinterpret_cast(x->data()), // const TX* x, - mat_dim_a.stride_, // int stride_a, - reinterpret_cast(y->data()), // const TW* w, - mat_dim_b.stride_, // int stride_b, - 0.0, // float beta, - reinterpret_cast(data_c), // TY* y, - m * n, // int stride_c, - nullptr, // const float* x_maxptr, - nullptr); // const float* w_maxptr - - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - platform::errors::External( - "XPU fc_batched kernel return wrong value[%d %s] " - "x_dims = %s , y_dims = %s", - r, - XPUAPIErrorMsg[r], - x_dims.to_str().c_str(), - y_dims.to_str().c_str())); - } -} - template class MatMulXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *y = context.Input("Y"); - auto *out = context.Output("Out"); + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); bool trans_x = context.Attr("transpose_X"); bool trans_y = context.Attr("transpose_Y"); - if (std::is_same::value) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, context); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, context); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, context); - } else { - MatMulXPUFunction(x, y, out, trans_x, trans_y, context); - } - } + float alpha = static_cast(context.Attr("alpha")); + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + XPUType* out_ptr = reinterpret_cast(out->data()); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + XpuFcInfo fc_info; + GetFCInfo(x_dims, y_dims, trans_x, trans_y, &fc_info); + auto& dev_ctx = + context.template device_context(); + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + MatMulXPUFunction(xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, alpha); } }; -// Reshape a rank-3 tensor from P x M x N to M x (P * N). -// (Warning: This requires transposing data and writes into new memory.) -// Identity op if the tensor is not of rank 3. 
-template -static framework::Tensor XPUFoldHeadAndLastDims( - const DeviceContext &context, const framework::Tensor &input) { - using XPUType = typename XPUTypeTrait::Type; - auto in_dims = input.dims(); - if (in_dims.size() != 3) { - return input; - } - - framework::Tensor output; - output.Resize({in_dims[1], in_dims[0], in_dims[2]}); - output.mutable_data(context.GetPlace()); - std::vector in_shape_host = {static_cast(in_dims[0]), - static_cast(in_dims[1]), - static_cast(in_dims[2])}; - std::vector axis_host = {1, 0, 2}; - int r = xpu::transpose(context.x_context(), - reinterpret_cast(input.data()), - reinterpret_cast(output.data()), - in_shape_host, - axis_host); - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - platform::errors::External( - "XPU transpose kernel return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); - - return output; -} - // Using dimensional constraints on matrix multiplication, it is // straight-forward to check the following table for when X and Y // are both matrices. @@ -317,107 +81,68 @@ static framework::Tensor XPUFoldHeadAndLastDims( // to X: (P * M) x K, dOut: (P * M) x N. template class MatMulGradXPUKernel : public framework::OpKernel { - public: - void MatMul(const framework::ExecutionContext &context, - const framework::Tensor &a, - bool trans_a, - const framework::Tensor &b, - bool trans_b, - framework::Tensor *out) const { - out->mutable_data(context.GetPlace()); - if (std::is_same::value) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); - } else { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); - } - } - } - - void CalcInputGrad(const framework::ExecutionContext &context, - const framework::Tensor &a, - bool trans_a, - bool is_fold_init_dims_a, - const framework::Tensor &b, - bool trans_b, - bool is_fold_init_dims_b, - framework::Tensor *out) const { - if (out == nullptr) return; - bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && - out->dims().size() == 2; - if (!need_combine) { - MatMul(context, a, trans_a, b, trans_b, out); - } else { - auto &dev_ctx = context.template device_context(); - MatMul(context, - is_fold_init_dims_a - ? FoldInitDims(a) - : XPUFoldHeadAndLastDims(dev_ctx, a), - trans_a, - is_fold_init_dims_b - ? 
FoldInitDims(b) - : XPUFoldHeadAndLastDims(dev_ctx, b), - trans_b, - out); - } - } + using XPUType = typename XPUTypeTrait::Type; - void Compute(const framework::ExecutionContext &context) const override { + public: + void Compute(const framework::ExecutionContext& context) const override { auto x = *context.Input("X"); auto y = *context.Input("Y"); auto dout = *context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dy = context.Output(framework::GradVarName("Y")); + auto* dx = context.Output(framework::GradVarName("X")); + auto* dy = context.Output(framework::GradVarName("Y")); bool transpose_x = context.Attr("transpose_X"); bool transpose_y = context.Attr("transpose_Y"); - - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - - framework::DDim dx_dims; + float alpha = static_cast(context.Attr("alpha")); if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } + dx->mutable_data(context.GetPlace()); } - - framework::DDim dy_dims; if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - if (transpose_x && transpose_y) { - CalcInputGrad(context, y, true, true, dout, true, false, dx); - CalcInputGrad(context, dout, true, true, x, true, false, dy); - } else if (transpose_x) { - CalcInputGrad(context, y, false, false, dout, true, false, dx); - CalcInputGrad(context, x, false, false, dout, false, true, dy); - } else if (transpose_y) { - CalcInputGrad(context, dout, false, false, y, false, true, dx); - CalcInputGrad(context, dout, true, true, x, false, true, dy); - } else { - CalcInputGrad(context, dout, false, false, y, true, false, dx); - CalcInputGrad(context, x, true, true, dout, false, true, dy); + dy->mutable_data(context.GetPlace()); } - + auto& dev_ctx = + context.template device_context(); + + const XPUType* dout_ptr = reinterpret_cast(dout.data()); + const XPUType* x_ptr = reinterpret_cast(x.data()); + const XPUType* y_ptr = reinterpret_cast(y.data()); + + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + XpuFcInfo info_forward; + GetFCInfo(x.dims(), y.dims(), transpose_x, transpose_y, &info_forward); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + // begin calculate + const XPUType* a_1 = reinterpret_cast(NULL); + const XPUType* b_1 = reinterpret_cast(NULL); + const XPUType* a_2 = reinterpret_cast(NULL); + const XPUType* b_2 = reinterpret_cast(NULL); + XPUType* c_1 = (dx == NULL) ? reinterpret_cast(NULL) + : reinterpret_cast(dx->data()); + XPUType* c_2 = (dy == NULL) ? 
reinterpret_cast(NULL) + : reinterpret_cast(dy->data()); + XpuFcInfo info_dx; + XpuFcInfo info_dy; + std::tuple + fc_info = MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + info_forward, + transpose_x, + transpose_y, + x_ptr, + y_ptr, + dout_ptr); + std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) = fc_info; if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } + MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, alpha); } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } + MatMulXPUFunction(xpu_ctx, a_2, b_2, c_2, info_dy, alpha); } } }; diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index e9cb665f4fc0e..7b4195c1c19fa 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -16,146 +16,17 @@ #include #include - #include "paddle/fluid/operators/matmul_v2_op.h" + #include "paddle/fluid/operators/xpu_api_wrapper.h" namespace paddle { namespace operators { -template -static void MatMulXPUFunction(const Tensor* x, - const Tensor* y, - Tensor* out, - bool trans_x, - bool trans_y, - const paddle::framework::ExecutionContext& ctx) { - using XPUType = typename XPUTypeTrait::Type; - const auto& x_dims = x->dims(); - const auto& y_dims = y->dims(); - auto& dev_ctx = - ctx.template device_context(); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x_dims), 0, trans_x); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( - ColumnMatrixFromVector(y_dims), 0, trans_y); - - if (x_dims.size() >= 3 && y_dims.size() <= 2) { - // if transpose_X is true, the transpose cost much time - if (!trans_x) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } else { - mat_dim_b.batch_size_ = mat_dim_a.batch_size_; - mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; - } - } - - if (mat_dim_a.width_ == mat_dim_b.height_) { - if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) { - mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; - } - if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) { - mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; - } - } - - PADDLE_ENFORCE_EQ(mat_dim_a.width_, - mat_dim_b.height_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_v2_op xdims = %s ydims = %s " - "x_trans = %d y_trans = %d", - x_dims.to_str(), - y_dims.to_str(), - mat_dim_a.trans_, - mat_dim_b.trans_)); - PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, - mat_dim_b.batch_size_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_v2_op xdims = %s ydims = %s " - "x_trans = %d y_trans = %d", - x_dims.to_str(), - y_dims.to_str(), - mat_dim_a.trans_, - mat_dim_b.trans_)); - - T* data_c = out->data(); - int m = mat_dim_a.height_; - int n = mat_dim_b.width_; - int k = mat_dim_a.width_; - int batch_size = mat_dim_a.batch_size_; - int ldx = mat_dim_a.trans_ ? m : k; - int ldy = mat_dim_b.trans_ ? 
k : n; - int ldout = n; - if (batch_size <= 1) { - int r = 0; - r = xpu_fc_wrapper( - dev_ctx.x_context(), - reinterpret_cast(x->data()), - reinterpret_cast(y->data()), - reinterpret_cast(data_c), - m, - n, - k, - mat_dim_a.trans_, - mat_dim_b.trans_, - nullptr, - nullptr, - nullptr, - ldx, - ldy, - ldout, - 1.0, - 0, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU fc kernel return wrong value[%d %s] , m = %d, n = " - "%d, " - "k = %d, a_tr = %d, b_tr = %d", - r, - XPUAPIErrorMsg[r], - m, - n, - k, - mat_dim_a.trans_, - mat_dim_b.trans_)); - } else { - // batch matmul - int r = xpu::fc_batched( - dev_ctx.x_context(), // Context* ctx, - batch_size, // int batch_size, - mat_dim_a.trans_, // bool x_trans, - mat_dim_b.trans_, // bool w_trans, - m, // int m, - n, // int n, - k, // int k, - 1.0, // float alpha, - reinterpret_cast(x->data()), // const TX* x, - mat_dim_a.stride_, // int stride_a, - reinterpret_cast(y->data()), // const TW* w, - mat_dim_b.stride_, // int stride_b, - 0.0, // float beta, - reinterpret_cast(data_c), // TY* y, - m * n, // int stride_c, - nullptr, // const float* x_maxptr, - nullptr); // const float* w_maxptr - - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - platform::errors::External( - "XPU fc_batched kernel return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - } -} - template class MatMulV2XPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); @@ -164,160 +35,84 @@ class MatMulV2XPUKernel : public framework::OpKernel { bool trans_x = ctx.Attr("trans_x"); bool trans_y = ctx.Attr("trans_y"); out->mutable_data(ctx.GetPlace()); - if (std::is_same::value) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } - } + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + XPUType* out_ptr = reinterpret_cast(out->data()); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + + XpuFcInfo fc_info; + GetFCInfo(x_dims, y_dims, trans_x, trans_y, &fc_info); + auto& dev_ctx = + ctx.template device_context(); + xpu::Context* xpu_ctx = dev_ctx.x_context(); + MatMulXPUFunction(xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, 1.0f); } }; -template -static framework::Tensor XPUFoldHeadAndLastDims( - const DeviceContext& context, const framework::Tensor& input) { - using XPUType = typename XPUTypeTrait::Type; - auto in_dims = input.dims(); - if (in_dims.size() != 3) { - return input; - } - - framework::Tensor output; - output.Resize({in_dims[1], in_dims[0], in_dims[2]}); - output.mutable_data(context.GetPlace()); - std::vector in_shape_host = {static_cast(in_dims[0]), - static_cast(in_dims[1]), - static_cast(in_dims[2])}; - std::vector axis_host = {1, 0, 2}; - - int r = xpu::transpose(context.x_context(), - reinterpret_cast(input.data()), - reinterpret_cast(output.data()), - in_shape_host, - axis_host); - PADDLE_ENFORCE_EQ(r, - XPU_SUCCESS, - platform::errors::External( - "XPU transpose kernel return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); - - return output; 
-} - template class MatMulV2XPUGradKernel : public framework::OpKernel { - public: - void MatMul(const framework::ExecutionContext& ctx, - const framework::Tensor& a, - bool trans_a, - const framework::Tensor& b, - bool trans_b, - framework::Tensor* out) const { - out->mutable_data(ctx.GetPlace()); - if (std::is_same::value) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } - } - } - - void CalcInputGrad(const framework::ExecutionContext& context, - const framework::Tensor& a, - bool trans_a, - bool is_fold_init_dims_a, - const framework::Tensor& b, - bool trans_b, - bool is_fold_init_dims_b, - framework::Tensor* out) const { - if (out == nullptr) return; - bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && - out->dims().size() == 2; - if (!need_combine) { - MatMul(context, a, trans_a, b, trans_b, out); - } else { - auto& dev_ctx = - context.template device_context(); - MatMul( - context, - is_fold_init_dims_a - ? FoldInitDims(a) - : XPUFoldHeadAndLastDims( - dev_ctx, a), - trans_a, - is_fold_init_dims_b - ? FoldInitDims(b) - : XPUFoldHeadAndLastDims( - dev_ctx, b), - trans_b, - out); - } - } + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { bool transpose_x = context.Attr("trans_x"); bool transpose_y = context.Attr("trans_y"); - auto x = *context.Input("X"); auto y = *context.Input("Y"); auto dout = *context.Input(framework::GradVarName("Out")); auto* dx = context.Output(framework::GradVarName("X")); auto* dy = context.Output(framework::GradVarName("Y")); - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - - framework::DDim dx_dims; if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } + dx->mutable_data(context.GetPlace()); } - - framework::DDim dy_dims; if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - if (transpose_x && transpose_y) { - CalcInputGrad(context, y, true, true, dout, true, false, dx); - CalcInputGrad(context, dout, true, true, x, true, false, dy); - } else if (transpose_x) { - CalcInputGrad(context, y, false, false, dout, true, false, dx); - CalcInputGrad(context, x, false, false, dout, false, true, dy); - } else if (transpose_y) { - CalcInputGrad(context, dout, false, false, y, false, true, dx); - CalcInputGrad(context, dout, true, true, x, false, true, dy); - } else { - CalcInputGrad(context, dout, false, false, y, true, false, dx); - CalcInputGrad(context, x, true, true, dout, false, true, dy); + dy->mutable_data(context.GetPlace()); } - + auto& dev_ctx = + context.template device_context(); + + const XPUType* dout_ptr = reinterpret_cast(dout.data()); + const XPUType* x_ptr = reinterpret_cast(x.data()); + const XPUType* y_ptr = reinterpret_cast(y.data()); + + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + XpuFcInfo info_forward; + GetFCInfo(x.dims(), y.dims(), transpose_x, transpose_y, &info_forward); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + // begin calculate + const XPUType* a_1 = reinterpret_cast(NULL); + const XPUType* b_1 = reinterpret_cast(NULL); + const XPUType* a_2 = reinterpret_cast(NULL); + const XPUType* b_2 = 
reinterpret_cast(NULL); + XPUType* c_1 = (dx == NULL) ? reinterpret_cast(NULL) + : reinterpret_cast(dx->data()); + XPUType* c_2 = (dy == NULL) ? reinterpret_cast(NULL) + : reinterpret_cast(dy->data()); + XpuFcInfo info_dx; + XpuFcInfo info_dy; + std::tuple + fc_info = MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + info_forward, + transpose_x, + transpose_y, + x_ptr, + y_ptr, + dout_ptr); + std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) = fc_info; if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } + MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f); } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } + MatMulXPUFunction(xpu_ctx, a_2, b_2, c_2, info_dy, 1.0f); } } }; diff --git a/paddle/fluid/operators/mul_op_xpu.cc b/paddle/fluid/operators/mul_op_xpu.cc index 706af96d1a6c4..727a7c0f6e52c 100644 --- a/paddle/fluid/operators/mul_op_xpu.cc +++ b/paddle/fluid/operators/mul_op_xpu.cc @@ -49,50 +49,23 @@ class MulXPUKernel : public framework::OpKernel { *y, context.template Attr("y_num_col_dims")) : *y; z->mutable_data(context.GetPlace()); - auto z_dim = z->dims(); - if (z_dim.size() != 2) { - z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } + + const XPUType* x_ptr = reinterpret_cast(x_matrix.data()); + const XPUType* y_ptr = reinterpret_cast(y_matrix.data()); + XPUType* out_ptr = reinterpret_cast(z->data()); + bool trans_a = false; bool trans_b = false; - int m = x_matrix.dims()[0]; - int k = x_matrix.dims()[1]; - int k1 = y_matrix.dims()[0]; - int n = y_matrix.dims()[1]; - PADDLE_ENFORCE_EQ( - k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); - T alpha = static_cast(1.0); - T beta = static_cast(0.0); - const T* data_a = x_matrix.data(); - const T* data_b = y_matrix.data(); - T* data_c = z->data(); - auto& dev_ctx = context.template device_context(); - - int ret = xpu_fc_wrapper( - dev_ctx.x_context(), - reinterpret_cast(data_a), - reinterpret_cast(data_b), - reinterpret_cast(data_c), - m, - n, - k, - trans_a, - trans_b, - nullptr, - nullptr, - nullptr, - k, - n, - n, - alpha, - beta, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); - - if (z_dim.size() != 2) { - z->Resize(z_dim); - } + auto x_dims = x_matrix.dims(); + auto y_dims = y_matrix.dims(); + + XpuFcInfo fc_info; + GetFCInfo(x_dims, y_dims, trans_a, trans_b, &fc_info); + auto& dev_ctx = + context.template device_context(); + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + MatMulXPUFunction(xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, 1.0f); } }; @@ -125,98 +98,51 @@ class MulGradXPUKernel : public framework::OpKernel { dy->set_lod(y->lod()); } auto& dev_ctx = ctx.template device_context(); + + XpuFcInfo info_forward; + GetFCInfo(x_matrix.dims(), y_matrix.dims(), false, false, &info_forward); + + const XPUType* dout_ptr = reinterpret_cast(dout->data()); + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + + xpu::Context* xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + // begin calculate + const XPUType* a_1 = reinterpret_cast(NULL); + const XPUType* b_1 = reinterpret_cast(NULL); + const XPUType* a_2 = reinterpret_cast(NULL); + const XPUType* b_2 = reinterpret_cast(NULL); + XPUType* c_1 = + (dx == NULL) + ? reinterpret_cast(NULL) + : reinterpret_cast(dx->mutable_data(ctx.GetPlace())); + XPUType* c_2 = + (dy == NULL) + ? 
reinterpret_cast(NULL) + : reinterpret_cast(dy->mutable_data(ctx.GetPlace())); + XpuFcInfo info_dx; + XpuFcInfo info_dy; + std::tuple + fc_info = MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + info_forward, + false, + false, + x_ptr, + y_ptr, + dout_ptr); + std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) = fc_info; if (dx) { - dx->mutable_data(ctx.GetPlace()); - Tensor dx_matrix = dx->dims().size() > 2 - ? framework::ReshapeToMatrix(*dx, x_num_col_dims) - : *dx; - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - // blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - bool trans_a = false; - bool trans_b = true; - int m = dout_mat.dims()[0]; - int k = dout_mat.dims()[1]; - int n = y_matrix.dims()[0]; - int k1 = y_matrix.dims()[1]; - PADDLE_ENFORCE_EQ( - k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); - int lda = (!trans_a) ? k : m; - int ldb = (!trans_b) ? n : k; - int ldc = n; - T alpha = static_cast(1.0); - T beta = static_cast(0.0); - const T* data_a = dout->data(); - const T* data_b = y_matrix.data(); - T* data_c = dx_matrix.data(); - - int ret = xpu_fc_wrapper( - dev_ctx.x_context(), - reinterpret_cast(data_a), - reinterpret_cast(data_b), - reinterpret_cast(data_c), - m, - n, - k, - trans_a, - trans_b, - nullptr, - nullptr, - nullptr, - lda, - ldb, - ldc, - alpha, - beta, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); + MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - Tensor dy_matrix = dy->dims().size() > 2 - ? framework::ReshapeToMatrix(*dy, y_num_col_dims) - : *dy; - // dy = x' * dout. dy K x N, dout : M x N, x : M x K - // blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - bool trans_a = true; - bool trans_b = false; - int k = x_matrix.dims()[0]; - int m = x_matrix.dims()[1]; - int k1 = dout_mat.dims()[0]; - int n = dout_mat.dims()[1]; - PADDLE_ENFORCE_EQ( - k, k1, platform::errors::InvalidArgument("Shape mistake in mul_op")); - int lda = (!trans_a) ? k : m; - int ldb = (!trans_b) ? n : k; - int ldc = n; - T alpha = static_cast(1.0); - T beta = static_cast(0.0); - const T* data_a = x_matrix.data(); - const T* data_b = dout->data(); - T* data_c = dy_matrix.data(); - - int ret = xpu_fc_wrapper( - dev_ctx.x_context(), - reinterpret_cast(data_a), - reinterpret_cast(data_b), - reinterpret_cast(data_c), - m, - n, - k, - trans_a, - trans_b, - nullptr, - nullptr, - nullptr, - lda, - ldb, - ldc, - alpha, - beta, - nullptr, - xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "xpu_fc_wrapper"); + MatMulXPUFunction(xpu_ctx, a_2, b_2, c_2, info_dy, 1.0f); } } }; diff --git a/paddle/fluid/operators/xpu_api_wrapper.h b/paddle/fluid/operators/xpu_api_wrapper.h index 8d51e53e8b394..c85a765f3b6fd 100644 --- a/paddle/fluid/operators/xpu_api_wrapper.h +++ b/paddle/fluid/operators/xpu_api_wrapper.h @@ -12,42 +12,206 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { namespace operators { +using float16 = typename XPUTypeTrait::Type; + +enum XPUFCCalcType { + FC_INT16 = 0, + FC_INT32, + FC_FLOAT, +}; + +template +XPUFCCalcType FCCalcType() { + if (std::is_same::value || + std::is_same::value) { + return XPUFCCalcType::FC_INT16; + } else if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { + return XPUFCCalcType::FC_INT32; + } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { + return XPUFCCalcType::FC_FLOAT; + } + return XPUFCCalcType::FC_INT16; +} + +struct XpuFcInfo { + int bs; + int m; + int n; + int k; + bool trans_x; + bool trans_y; + int stride_x; + int stride_y; + int stride_out; + float* max_x; + float* max_y; + float* max_out; + XpuFcInfo() + : bs(0), + m(0), + n(0), + k(0), + trans_x(false), + trans_y(false), + stride_x(0), + stride_y(0), + stride_out(0), + max_x(nullptr), + max_y(nullptr), + max_out(nullptr) {} + void InitFcInfo(int bs, + int m, + int n, + int k, + bool trans_x, + bool trans_y, + float* max_x, + float* max_y, + float* max_out) { + this->bs = bs; + this->m = m; + this->n = n; + this->k = k; + this->trans_x = trans_x; + this->trans_y = trans_y; + this->max_x = max_x; + this->max_y = max_y; + this->max_out = max_out; + + if (this->bs <= 1) { + this->stride_x = trans_x ? m : k; + this->stride_y = trans_y ? k : n; + this->stride_out = n; + } else { + this->stride_x = m * k; + this->stride_y = k * n; + this->stride_out = m * n; + } + } +}; + +static std::ostream& operator<<(std::ostream& os, const XpuFcInfo& fc_inf) { + os << "fc_inf[ bs, m, n, k, trans_x, trans_y, stride_x, stride_y, " + "stride_out] = " + << "[" << fc_inf.bs << ", " << fc_inf.m << ", " << fc_inf.n << ", " + << fc_inf.k << ", " << fc_inf.trans_x << ", " << fc_inf.trans_y << ", " + << fc_inf.stride_x << ", " << fc_inf.stride_y << ", " << fc_inf.stride_out; + return os; +} + +static void GetFCInfo(const phi::DDim& x_dims, + const phi::DDim& y_dims, + bool trans_x, + bool trans_y, + XpuFcInfo* info) { + framework::DDim new_x_dims = + (x_dims.size() > 1) ? x_dims : phi::make_ddim({1, x_dims[0]}); + framework::DDim new_y_dims = + (y_dims.size() > 1) ? 
y_dims : phi::make_ddim({y_dims[0], 1}); + + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(new_x_dims, 0, trans_x); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(new_y_dims, 0, trans_y); + + if (x_dims.size() >= 3 && y_dims.size() <= 2) { + if (!trans_x) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; + } else { + mat_dim_b.batch_size_ = mat_dim_a.batch_size_; + mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; + } + } + + if (y_dims.size() >= 3 && x_dims.size() <= 2) { + PADDLE_ENFORCE_EQ( + mat_dim_b.trans_, + false, + platform::errors::InvalidArgument( + "xpu not support this Shape in matmul_op xdims = %s ydims = %s " + "x_trans = %d y_trans = %d", + x_dims.to_str(), + y_dims.to_str(), + mat_dim_a.trans_, + mat_dim_b.trans_)); + mat_dim_b.height_ *= mat_dim_b.batch_size_; + mat_dim_b.batch_size_ = 0; + } + + if (mat_dim_a.width_ == mat_dim_b.height_) { + if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) { + mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; + } + if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) { + mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; + } + } + + PADDLE_ENFORCE_EQ(mat_dim_a.width_, + mat_dim_b.height_, + platform::errors::InvalidArgument( + "Shape mistake in matmul_op xdims = %s ydims = %s " + "x_trans = %d y_trans = %d", + x_dims.to_str(), + y_dims.to_str(), + mat_dim_a.trans_, + mat_dim_b.trans_)); + + info->m = mat_dim_a.height_; + info->n = mat_dim_b.width_; + info->k = mat_dim_a.width_; + info->bs = mat_dim_a.batch_size_; + info->trans_x = trans_x; + info->trans_y = trans_y; + + if (info->bs <= 1) { + info->stride_x = trans_x ? info->m : info->k; + info->stride_y = trans_y ? info->k : info->n; + info->stride_out = info->n; + } else { + info->stride_x = info->m * info->k; + info->stride_y = info->k * info->n; + info->stride_out = info->m * info->n; + } +} + template -int xpu_fc_wrapper(xpu::Context* ctx, - const XPUType* x, - const XPUType* w, - XPUType* y, - int m, - int n, - int k, - bool x_trans, - bool w_trans, - const float* x_maxptr, - const float* w_maxptr, - float* y_maxptr, - int ldx, - int ldw, - int ldy, - float alpha, - float beta, - const float* bias, - const xpu::Activation_t& act) { +static void xpu_fc_wrapper(xpu::Context* ctx, + const XPUType* x, + const XPUType* w, + XPUType* y, + int m, + int n, + int k, + bool x_trans, + bool w_trans, + const float* x_maxptr, + const float* w_maxptr, + float* y_maxptr, + int ldx, + int ldw, + int ldy, + float alpha, + float beta, + const float* bias, + const xpu::Activation_t& act) { int r = 0; if (x_trans && std::getenv("XPU_PADDLE_FC_TRANS_A") != nullptr && std::is_same::value) { XPUType* l3_addr = nullptr; xpu::ctx_guard RAII_GUARD(ctx); l3_addr = RAII_GUARD.alloc_l3_or_gm(m * k); - if (l3_addr == nullptr) return XPUERR_NOMEM; + PADDLE_ENFORCE_XDNN_NOT_NULL(l3_addr); std::vector shape = {k, m}; std::vector axis = {1, 0}; r = xpu::transpose(ctx, x, l3_addr, shape, axis); - if (r != XPU_SUCCESS) return r; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); r = xpu::fc_fusion(ctx, l3_addr, @@ -68,7 +232,7 @@ int xpu_fc_wrapper(xpu::Context* ctx, beta, bias, act); - if (r != XPU_SUCCESS) return r; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_fusion"); } else { r = xpu::fc_fusion(ctx, x, @@ -89,8 +253,356 @@ int xpu_fc_wrapper(xpu::Context* ctx, beta, bias, act); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_fusion"); } - return r; +} + +template <> +void xpu_fc_wrapper(xpu::Context* ctx, + const float16* x, + const float16* w, + float16* y, + 
int m, + int n, + int k, + bool x_trans, + bool w_trans, + const float* x_maxptr, + const float* w_maxptr, + float* y_maxptr, + int ldx, + int ldw, + int ldy, + float alpha, + float beta, + const float* bias, + const xpu::Activation_t& act) { + int r = xpu::Error_t::INVALID_PARAM; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_fc_wrapper"); +} + +template +static void xpu_fc_batch_wrapper(xpu::Context* xpu_ctx, + int bs, + bool trans_x, + bool trans_w, + int m, + int n, + int k, + float alpha, + const XPUType* x, + int stride_x, + const XPUType* w, + int stride_w, + float beta, + XPUType* y, + int stride_y, + const float* x_maxptr, + const float* w_maxptr) { + int r = xpu::fc_batched( + xpu_ctx, // Context* ctx, + bs, // int batch_size, + trans_x, // bool x_trans, + trans_w, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + alpha, // float alpha, + reinterpret_cast(x), // const TX* x, + stride_x, // int stride_a, + reinterpret_cast(w), // const TW* w, + stride_w, // int stride_b, + 0.0, // float beta, + reinterpret_cast(y), // TY* y, + stride_y, // int stride_c, + x_maxptr, // const float* x_maxptr, + w_maxptr); // const float* w_maxptr + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_batched"); +} + +template <> +void xpu_fc_batch_wrapper(xpu::Context* xpu_ctx, + int bs, + bool trans_x, + bool trans_w, + int m, + int n, + int k, + float alpha, + const float16* x, + int stride_x, + const float16* w, + int stride_w, + float beta, + float16* y, + int stride_y, + const float* x_maxptr, + const float* w_maxptr) { + int r = xpu::Error_t::INVALID_PARAM; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_fc_batch_wrapper"); +} + +template <> +void xpu_fc_batch_wrapper(xpu::Context* xpu_ctx, + int bs, + bool trans_x, + bool trans_w, + int m, + int n, + int k, + float alpha, + const float16* x, + int stride_x, + const float16* w, + int stride_w, + float beta, + float16* y, + int stride_y, + const float* x_maxptr, + const float* w_maxptr) { + int r = xpu::Error_t::INVALID_PARAM; + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_fc_batch_wrapper"); +} + +template +static void MatMulXPUFunction(xpu::Context* xpu_ctx, + const T* x, + const T* y, + T* out, + const XpuFcInfo& fcinfo, + float alpha) { + using XPUType = typename XPUTypeTrait::Type; + using float16 = typename XPUTypeTrait::Type; + int fccal_type = FCCalcType(); + + decltype(&paddle::operators::xpu_fc_wrapper) + fc_api_list[3] = { + &paddle::operators::xpu_fc_wrapper, + &paddle::operators::xpu_fc_wrapper, + &paddle::operators::xpu_fc_wrapper, + }; + decltype(&paddle::operators::xpu_fc_batch_wrapper) + fc_batch_api_list[3] = { + &paddle::operators::xpu_fc_batch_wrapper, + &paddle::operators::xpu_fc_batch_wrapper, + &paddle::operators::xpu_fc_batch_wrapper, + }; + + auto fc_api = fc_api_list[fccal_type]; + auto fc_batch_api = fc_batch_api_list[fccal_type]; + + int m = fcinfo.m; + int n = fcinfo.n; + int k = fcinfo.k; + int batch_size = fcinfo.bs; + int ldx = fcinfo.stride_x; + int ldy = fcinfo.stride_y; + int ldout = fcinfo.stride_out; + bool trans_x = fcinfo.trans_x; + bool trans_y = fcinfo.trans_y; + float* max_x = fcinfo.max_x; + float* max_y = fcinfo.max_y; + float* max_out = fcinfo.max_out; + + if (batch_size <= 1) { + fc_api(xpu_ctx, + reinterpret_cast(x), + reinterpret_cast(y), + reinterpret_cast(out), + m, + n, + k, + trans_x, + trans_y, + max_x, + max_y, + max_out, + ldx, + ldy, + ldout, + alpha, + 0, + nullptr, + xpu::Activation_t::LINEAR); + } else { + // batch matmul + fc_batch_api(xpu_ctx, // Context* ctx, + batch_size, // int batch_size, + trans_x, // bool 
x_trans, + trans_y, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + alpha, // float alpha, + reinterpret_cast(x), // const TX* x, + ldx, // int stride_a, + reinterpret_cast(y), // const TW* w, + ldy, // int stride_b, + 0.0, // float beta, + reinterpret_cast(out), // TY* y, + ldout, // int stride_c, + max_x, // const float* x_maxptr, + max_y); // const float* w_maxptr + } +} + +template +static std::tuple +MatmulGradFcInfo(xpu::Context* xpu_ctx, + xpu::ctx_guard* RAII_GUARD, + const XpuFcInfo& dout_shape, + bool trans_x, + bool trans_y, + const T* x, + const T* y, + const T* dout) { + XpuFcInfo dx_shape, dy_shape; + const T* dx_a = NULL; + const T* dx_b = NULL; + const T* dy_a = NULL; + const T* dy_b = NULL; + bool copy_to_l3 = false; + float* max_dout = NULL; + int maxptr_size = xpu_ctx->max_ptr_size(); + uint64_t l3_size = uint64_t(xpu_ctx->_l3_mgr.get_size()); + int bs = (dout_shape.bs <= 1) ? (1) : (dout_shape.bs); + int dx_size = bs * dout_shape.m * dout_shape.k; + int dy_size = bs * dout_shape.k * dout_shape.n; + int dout_size = bs * dout_shape.m * dout_shape.n; + if (trans_x && trans_y) { + copy_to_l3 = l3_size >= (dout_size * 2 + dy_size) * sizeof(T); + } else if (trans_x) { + copy_to_l3 = l3_size >= dout_size * sizeof(T); + } else if (trans_y) { + copy_to_l3 = l3_size >= dout_size * 2 * sizeof(T); + } else { + copy_to_l3 = l3_size >= (dout_size + dx_size) * sizeof(T); + } + + const T* dout_new = dout; + int r = 0; + if (copy_to_l3) { + T* dout_l3 = RAII_GUARD->alloc_l3(dout_size); + PADDLE_ENFORCE_XDNN_NOT_NULL(dout_l3); + if ((dout_shape.bs > 1) || ((dout_shape.bs <= 1) && + (FCCalcType() == XPUFCCalcType::FC_FLOAT))) { + r = xpu::copy(xpu_ctx, dout, dout_l3, dout_size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy"); + dout_new = dout_l3; + } else { + max_dout = RAII_GUARD->alloc_l3_or_gm(maxptr_size); + PADDLE_ENFORCE_XDNN_NOT_NULL(max_dout); + + r = xpu::findmax_copy_fusion(xpu_ctx, dout, max_dout, dout_l3, dout_size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + dout_new = dout_l3; + } + } else if (((dout_shape.bs <= 1) && + (FCCalcType() != XPUFCCalcType::FC_FLOAT))) { + max_dout = RAII_GUARD->alloc_l3_or_gm(maxptr_size); + PADDLE_ENFORCE_XDNN_NOT_NULL(max_dout); + r = xpu::findmax_copy_fusion( + xpu_ctx, dout, max_dout, reinterpret_cast(NULL), dout_size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + } + + if (trans_x && trans_y) { + // dx = T(y) * T(dout) + dx_shape.InitFcInfo(dout_shape.bs, + dout_shape.k, + dout_shape.m, + dout_shape.n, + true, + true, + nullptr, + max_dout, + nullptr); + dx_a = y, dx_b = dout_new; + // dy = T(dout) * T(x) + dy_shape.InitFcInfo(dout_shape.bs, + dout_shape.n, + dout_shape.k, + dout_shape.m, + true, + true, + max_dout, + nullptr, + nullptr); + dy_a = dout_new, dy_b = x; + } else if (trans_x) { + // dx = y * T(dout) + dx_shape.InitFcInfo(dout_shape.bs, + dout_shape.k, + dout_shape.m, + dout_shape.n, + false, + true, + nullptr, + max_dout, + nullptr); + dx_a = y, dx_b = dout_new; + // dy = x * dout + dy_shape.InitFcInfo(dout_shape.bs, + dout_shape.k, + dout_shape.n, + dout_shape.m, + false, + false, + nullptr, + max_dout, + nullptr); + dy_a = x, dy_b = dout_new; + } else if (trans_y) { + // dx = dout * y + dx_shape.InitFcInfo(dout_shape.bs, + dout_shape.m, + dout_shape.k, + dout_shape.n, + false, + false, + max_dout, + nullptr, + nullptr); + dx_a = dout_new, dx_b = y; + // dy = T(dout) * x + dy_shape.InitFcInfo(dout_shape.bs, + dout_shape.n, + dout_shape.k, + dout_shape.m, + true, + false, + max_dout, + 
nullptr, + nullptr); + dy_a = dout_new, dy_b = x; + } else { + // dx = dout * T(y) + dx_shape.InitFcInfo(dout_shape.bs, + dout_shape.m, + dout_shape.k, + dout_shape.n, + false, + true, + max_dout, + nullptr, + nullptr); + dx_a = dout_new, dx_b = y; + // dy = T(x) * dout + dy_shape.InitFcInfo(dout_shape.bs, + dout_shape.k, + dout_shape.n, + dout_shape.m, + true, + false, + nullptr, + max_dout, + nullptr); + dy_a = x, dy_b = dout_new; + } + std::tuple + result = std::make_tuple(dx_shape, dy_shape, dx_a, dx_b, dy_a, dy_b); + + return result; } } // namespace operators diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 27339a0f25a8a..2b80396cc3138 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -281,11 +281,18 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"matmul_v2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"matmul_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"matmul", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index d92378f60f578..b4032f2dcb67e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -84,7 +84,9 @@ xpu_test_op_white_list = [] xpu_test_device_type_white_list = ['xpu1_float64'] -xpu_test_op_type_white_list = ['dropout_float16', 'dropout_grad_float16'] +xpu_test_op_type_white_list = [ + 'dropout_float16', 'dropout_grad_float16', 'matmul_v2_float16' +] xpu_test_device_op_white_list = [] xpu_test_device_op_type_white_list = [] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py index bc6fa19a35444..1c68f8fb6bf16 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py @@ -303,7 +303,8 @@ def setUp(self): X = np.random.random(shape_X).astype(self.dtype) Y = np.random.random(shape_Y).astype(self.dtype) - Out = reference_matmul(X, Y, transpose_X, transpose_Y) + Out = reference_matmul(X, Y, transpose_X, + transpose_Y).astype(self.dtype) self.inputs = {'X': X, 'Y': Y} self.attrs = {'transpose_X': transpose_X, 'transpose_Y': transpose_Y} self.outputs = {'Out': Out} From 05b7ef8d78438eca9237cefbc7696d22783d8b49 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Thu, 7 Jul 2022 19:39:56 +0800 Subject: [PATCH 098/250] [Windows CI] copy onnxruntime.dll to c++ test folder in windows (#44121) * 
copy onnxruntime.dll to c++ test folder in windows * remove ut that failed due to onnxrumtime.dll * test_api_impl failed of diff * use TARGET to make sure if the test exist; use POST_BUILD to add copy command --- cmake/external/onnxruntime.cmake | 12 ++++++++++++ paddle/fluid/framework/ir/CMakeLists.txt | 7 +++++++ .../fluid/inference/analysis/CMakeLists.txt | 5 +++++ paddle/fluid/inference/api/CMakeLists.txt | 6 ++++++ .../inference/api/details/CMakeLists.txt | 6 ++++++ .../fluid/inference/tensorrt/CMakeLists.txt | 7 +++++++ .../inference/tensorrt/convert/CMakeLists.txt | 6 ++++++ paddle/fluid/inference/utils/CMakeLists.txt | 7 +++++++ .../fluid/operators/benchmark/CMakeLists.txt | 6 ++++++ .../fluid/operators/tensorrt/CMakeLists.txt | 6 ++++++ tools/windows/run_unittests.sh | 19 ++----------------- 11 files changed, 70 insertions(+), 17 deletions(-) diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake index b52b2c00d9cce..15901568ae1cd 100644 --- a/cmake/external/onnxruntime.cmake +++ b/cmake/external/onnxruntime.cmake @@ -134,3 +134,15 @@ endif() add_library(onnxruntime STATIC IMPORTED GLOBAL) set_property(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB}) add_dependencies(onnxruntime ${ONNXRUNTIME_PROJECT}) + +function(copy_onnx TARGET_NAME) + # If error of Exitcode0xc000007b happened when a .exe running, copy onnxruntime.dll + # to the .exe folder. + if(TARGET ${TARGET_NAME}) + add_custom_command( + TARGET ${TARGET_NAME} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SHARED_LIB} + ${CMAKE_CURRENT_BINARY_DIR} DEPENDS onnxruntime) + endif() +endfunction() diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 8569a3bb6151f..2e4b73c6ac19a 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -473,6 +473,13 @@ if(WITH_MKLDNN) test_compute_propagate_scales_mkldnn_pass SRCS mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc DEPS compute_propagate_scales_mkldnn_pass naive_executor) + + if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(test_compute_propagate_scales_mkldnn_pass) + endif() + cc_test( test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index c001f5eb8dfdc..67f0e3212db43 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -109,4 +109,9 @@ elseif(WIN32) paddle_inference_api ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) + if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(test_analyzer) + endif() endif() diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 0d55b9c66416a..9e601df8088fc 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -99,6 +99,12 @@ cc_test( SRCS api_tester.cc DEPS paddle_inference_api) +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. 
+ copy_onnx(test_paddle_inference_api) +endif() + if(WITH_TESTING) if(NOT APPLE AND NOT WIN32) if(WITH_GPU) diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt index 2acd96b3fb97c..02d5f91d630ce 100644 --- a/paddle/fluid/inference/api/details/CMakeLists.txt +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -38,3 +38,9 @@ cc_test( zero_copy_tensor_test SRCS zero_copy_tensor_test.cc DEPS paddle_inference_api) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(zero_copy_tensor_test) +endif() diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 0f1350459ef22..cd03dce1795e2 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -24,5 +24,12 @@ nv_test( test_tensorrt_engine SRCS test_engine.cc test_dynamic_engine.cc DEPS dynload_cuda tensorrt_engine tensorrt_plugin) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(test_tensorrt_engine) +endif() + add_subdirectory(plugin) add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index c999c009605ee..90089fcbfd806 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -85,3 +85,9 @@ nv_test( SRCS test_op_converter.cc DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(test_op_converter) +endif() diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 9ab07633e0fe0..f165002f353e4 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -18,6 +18,13 @@ cc_test( infer_io_utils_tester SRCS io_utils_tester.cc DEPS infer_io_utils) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(infer_io_utils_tester) +endif() + cc_library(table_printer SRCS table_printer.cc) cc_test( test_table_printer diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt index e05011eaf6b3a..b0a1c488f047c 100644 --- a/paddle/fluid/operators/benchmark/CMakeLists.txt +++ b/paddle/fluid/operators/benchmark/CMakeLists.txt @@ -12,3 +12,9 @@ cc_test( ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} eigen_function) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. 
+ copy_onnx(op_tester) +endif() diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt index e0fed2804a9b7..0d731b14c6a97 100644 --- a/paddle/fluid/operators/tensorrt/CMakeLists.txt +++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt @@ -4,3 +4,9 @@ nv_test( test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc DEPS tensorrt_engine_op analysis) + +if(WITH_ONNXRUNTIME AND WIN32) + # Copy onnxruntime for some c++ test in Windows, since the test will + # be build only in CI, so suppose the generator in Windows is Ninja. + copy_onnx(test_tensorrt_engine_op) +endif() diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 23a0b4d32828f..7af1cd81391d4 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -87,8 +87,6 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^lite_mul_model_test$|\ ^trt_split_converter_test$|\ ^paddle_infer_api_copy_tensor_tester$|\ -^test_tensorrt_engine_op$|\ -^test_tensorrt_engine$|\ ^test_trt_deformable_conv$|\ ^test_imperative_triple_grad$|\ ^test_full_name_usage$|\ @@ -103,7 +101,6 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_tensor_scalar_type_promotion_static$|\ ^test_matrix_power_op$|\ ^test_deformable_conv_v1_op$|\ -^zero_copy_tensor_test$|\ ^test_where_index$|\ ^test_custom_grad_input$|\ ^test_conv3d_transpose_op$|\ @@ -116,16 +113,6 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_basic_api_transformation$|\ ^test_deformable_conv_op$|\ ^test_variable$|\ -^test_conv_bias_mkldnn_fuse_pass_cc$|\ -^test_conv_batch_norm_mkldnn_fuse_pass$|\ -^test_compute_propagate_scales_mkldnn_pass$|\ -^test_cpu_quantize_pass$|\ -^test_cpu_quantize_squash_pass$|\ -^op_tester$|\ -^test_analyzer$|\ -^infer_io_utils_tester$|\ -^test_paddle_inference_api$|\ -^test_mkldnn_quantizer$|\ ^test_mkldnn_conv_hard_sigmoid_fuse_pass$|\ ^test_mkldnn_conv_hard_swish_fuse_pass$|\ ^test_conv_act_mkldnn_fuse_pass$|\ @@ -147,11 +134,9 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_slice$|\ ^test_conv_elementwise_add_fuse_pass$|\ ^test_executor_and_mul$|\ -^test_op_converter$|\ ^test_analyzer_int8_resnet50$|\ ^test_analyzer_int8_mobilenetv1$|\ ^test_trt_conv_pass$|\ -^test_analysis_predictor$|\ ^test_roll_op$|\ ^test_lcm$|\ ^test_elementwise_floordiv_op$|\ @@ -160,7 +145,6 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_trt_convert_deformable_conv$|\ ^test_conv_elementwise_add2_act_fuse_pass$|\ ^test_tensor_scalar_type_promotion_dynamic$|\ -^test_api_impl$|\ ^test_model$|\ ^test_py_reader_combination$|\ ^test_trt_convert_flatten$|\ @@ -198,7 +182,8 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_trt_fc_fuse_quant_dequant_pass$|\ ^test_unsqueeze2_eltwise_fuse_pass$|\ ^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\ -^test_parallel_executor_seresnext_with_reduce_gpu$" +^test_parallel_executor_seresnext_with_reduce_gpu$|\ +^test_api_impl$" # /*==========Fixed Disabled Windows CPU OPENBLAS((PR-CI-Windows-OPENBLAS)) unittests==============================*/ From fa6333f9d670192cf316bd0f5dc20984d40dcca8 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 7 Jul 2022 19:52:04 +0800 Subject: [PATCH 099/250] Fix pr build size (#44163) --- paddle/scripts/paddle_build.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 
4e6e5eeee4770..cbfd401d30b9d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3306,9 +3306,10 @@ function check_coverage_build() { rm -f build_size curl -O https://paddle-docker-tar.bj.bcebos.com/paddle_ci_index/build_size - curl -O https://xly-devops.bj.bcebos.com/PR/build_whl/${AGILE_PULL_ID}/${AGILE_REVISION}/coverage_build_size + #curl -O https://xly-devops.bj.bcebos.com/PR/build_whl/${AGILE_PULL_ID}/${AGILE_REVISION}/coverage_build_size + #pr_coverage_build_size=`cat coverage_build_size|sed 's#G##g'` dev_coverage_build_size=`cat build_size|sed 's#G##g'` - pr_coverage_build_size=`cat coverage_build_size|sed 's#G##g'` + pr_coverage_build_size=`echo $buildSize|sed 's#G##g'` diff_coverage_build_size=`echo $(($pr_coverage_build_size - $dev_coverage_build_size))` From 337bb47b5e220c387961b5d1cf7aa5c7b984b507 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 7 Jul 2022 10:11:43 -0500 Subject: [PATCH 100/250] Refine dist kernel, reuse norm (#44154) * refine dist kernel, reuse norm * follow comments --- paddle/phi/kernels/{cpu => }/dist_kernel.cc | 21 ++- paddle/phi/kernels/gpu/dist_kernel.cu | 27 ---- paddle/phi/kernels/impl/dist_kernel_impl.h | 166 -------------------- 3 files changed, 20 insertions(+), 194 deletions(-) rename paddle/phi/kernels/{cpu => }/dist_kernel.cc (57%) delete mode 100644 paddle/phi/kernels/gpu/dist_kernel.cu delete mode 100644 paddle/phi/kernels/impl/dist_kernel_impl.h diff --git a/paddle/phi/kernels/cpu/dist_kernel.cc b/paddle/phi/kernels/dist_kernel.cc similarity index 57% rename from paddle/phi/kernels/cpu/dist_kernel.cc rename to paddle/phi/kernels/dist_kernel.cc index 0c7b5db64b38f..ed1fa0dafe741 100644 --- a/paddle/phi/kernels/cpu/dist_kernel.cc +++ b/paddle/phi/kernels/dist_kernel.cc @@ -16,6 +16,25 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/dist_kernel_impl.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" +#include "paddle/phi/kernels/p_norm_kernel.h" + +namespace phi { + +template +void DistKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + float p, + DenseTensor* out) { + auto t = Subtract(dev_ctx, x, y); + PNormKernel(dev_ctx, t, p, -1, 1e-12, false, true, out); +} + +} // namespace phi PD_REGISTER_KERNEL(dist, CPU, ALL_LAYOUT, phi::DistKernel, float, double) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float, double) {} +#endif diff --git a/paddle/phi/kernels/gpu/dist_kernel.cu b/paddle/phi/kernels/gpu/dist_kernel.cu deleted file mode 100644 index 095110c252978..0000000000000 --- a/paddle/phi/kernels/gpu/dist_kernel.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
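// Note on the dist refactor above: the new DistKernel composes two existing phi
// kernels instead of the rank-templated Eigen implementation deleted below. It
// forms t = x - y with Subtract and then evaluates the p-norm of t over all
// elements with PNormKernel. A minimal sketch of the semantics (the numeric
// example is illustrative only, not taken from the patch):
//   dist(x, y, p) = ||x - y||_p
//     p == 0    -> number of non-zero entries of (x - y)
//     p == +inf -> max |x - y|;  p == -inf -> min |x - y|
//     otherwise -> pow(sum(pow(|x - y|, p)), 1.0 / p)
//   e.g. x = {1, 3}, y = {0, 1}, p = 2  ->  sqrt(1*1 + 2*2) = sqrt(5)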
- -#include "paddle/phi/kernels/dist_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/dist_kernel_impl.h" - -#ifdef PADDLE_WITH_HIP -// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 -// do not support double in HIPCC platform (Eigen3 to be fixed) -PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float) {} -#else -PD_REGISTER_KERNEL(dist, GPU, ALL_LAYOUT, phi::DistKernel, float, double) {} -#endif diff --git a/paddle/phi/kernels/impl/dist_kernel_impl.h b/paddle/phi/kernels/impl/dist_kernel_impl.h deleted file mode 100644 index c4ee7cec34750..0000000000000 --- a/paddle/phi/kernels/impl/dist_kernel_impl.h +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include -#include - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -template -using ETensor = phi::EigenTensor; - -template -static void GetBraodcastDims(const phi::DDim& x_dims, - const phi::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, - 0, - phi::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, - y_dims)); -} - -static phi::DDim GetNewDims(const phi::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistFunction(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - float p, - DenseTensor* out) { - if (out) { - dev_ctx.template Alloc(out); - } - auto x_dims = x.dims(); - auto y_dims = y.dims(); - - // new dims with same size as rank, e.g. 
(rank=3, (4, 3) => (1, 4, 3)) - phi::DDim x_new_dims = GetNewDims(x_dims, Rank); - phi::DDim y_new_dims = GetNewDims(y_dims, Rank); - - auto x_t = ETensor::From(x, x_new_dims); - auto y_t = ETensor::From(y, y_new_dims); - auto out_t = ETensor::From(*out); - auto& place = *dev_ctx.eigen_device(); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - // p=0 means number of non-zero elements of (x-y) - // p=inf means the maximum of |x-y| - // p=-inf means the minimum of |x-y| - // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p) - if (p == 0) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims)) - .template cast() - .sum(); - } else if (p == INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .maximum(); - } else if (p == -INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .minimum(); - } else { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .pow(p) - .sum() - .pow(1.0 / p); - } -} - -template -void DistKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - float p, - DenseTensor* out) { - auto x_rank = x.dims().size(); - auto y_rank = y.dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, - 6, - phi::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, - y_rank)); - switch (rank) { - case 1: - DistFunction(dev_ctx, x, y, p, out); - break; - case 2: - DistFunction(dev_ctx, x, y, p, out); - break; - case 3: - DistFunction(dev_ctx, x, y, p, out); - break; - case 4: - DistFunction(dev_ctx, x, y, p, out); - break; - case 5: - DistFunction(dev_ctx, x, y, p, out); - break; - case 6: - DistFunction(dev_ctx, x, y, p, out); - break; - } -} - -} // namespace phi From d7be46b3ebe01d11c5e355cd4a72bdbbfcf1fde5 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Fri, 8 Jul 2022 10:00:55 +0800 Subject: [PATCH 101/250] add implement of resnet_basic_block op for XPU2, test=kunlun (#44143) --- .../operators/fused/resnet_basic_block_op.cc | 37 +- .../fused/resnet_basic_block_op_xpu.cc | 970 ++++++++++++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 8 + 3 files changed, 997 insertions(+), 18 deletions(-) create mode 100644 paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op.cc b/paddle/fluid/operators/fused/resnet_basic_block_op.cc index d54a889f93aa6..5990db8147be4 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op.cc @@ -258,24 +258,25 @@ class ResNetBasicBlockOp : public framework::OperatorWithKernel { class ResNetBasicBlockOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { - // has_shortcut = True: X else: X - // / / - // | | | | - // CONV1 | CONV1 | - // | | | | - // BN1 | BN1 | - // | | | | - // RELU1 | RELU1 | - // | | | | - // CONV2 CONV3 CONV2 | - // | | | | - // BN2 BN3 BN2 | - // \ / \ / - // ADD ADD - // | | - // RELU RELU - // | | - // Y Y + // has_shortcut = True: else: + // X X + // / / + // | | | | + // CONV1 | CONV1 | + // | | | | + // BN1 | BN1 | + // | | | | + // RELU1 | RELU1 | + // | | | | + // CONV2 CONV3 CONV2 | + // | | | | + // BN2 BN3 BN2 | + // \ / \ 
/ + // ADD ADD + // | | + // RELU RELU + // | | + // Y Y AddInput("X", "Input tensor of conv 1"); AddInput("Filter1", "Filter tensor of conv 1"); AddInput("Scale1", "Scale tensor of bn 1"); diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc new file mode 100644 index 0000000000000..c7a6620c75f8e --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc @@ -0,0 +1,970 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/phi/api/all.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class ResnetBasicBlockAttr { + public: + explicit ResnetBasicBlockAttr(const framework::ExecutionContext& ctx) { + padding1 = ctx.Attr("padding1"); + padding2 = ctx.Attr("padding2"); + padding3 = ctx.Attr("padding3"); + stride1 = ctx.Attr("stride1"); + stride2 = ctx.Attr("stride2"); + stride3 = ctx.Attr("stride3"); + dilation1 = ctx.Attr("dilation1"); + dilation2 = ctx.Attr("dilation2"); + dilation3 = ctx.Attr("dilation3"); + group = ctx.Attr("group"); + + eps = static_cast(ctx.Attr("epsilon")); + momentum = static_cast(ctx.Attr("momentum")); + has_shortcut = ctx.Attr("has_shortcut"); + find_max = ctx.Attr("find_conv_input_max"); + + const auto is_test = ctx.Attr("is_test"); + const auto use_global_stats = ctx.Attr("use_global_stats"); + const auto trainable_stats = ctx.Attr("trainable_statistics"); + bool test_mode = is_test && (!trainable_stats); + global_stats = test_mode || use_global_stats; + + // init shape + auto input1 = ctx.Input("X"); + auto filter1 = ctx.Input("Filter1"); + auto conv1_out = ctx.Output("Conv1"); + auto filter2 = ctx.Input("Filter2"); + auto conv2_out = ctx.Output("Conv2"); + conv1_input_shape = phi::vectorize(input1->dims()); + conv1_output_shape = phi::vectorize(conv1_out->dims()); + conv1_filter_shape = phi::vectorize(filter1->dims()); + conv1_filter_numel = filter1->numel(); + conv1_input_numel = input1->numel(); + conv1_output_numel = conv1_out->numel(); + + conv2_input_shape = phi::vectorize(conv1_out->dims()); + conv2_output_shape = phi::vectorize(conv2_out->dims()); + conv2_filter_shape = phi::vectorize(filter2->dims()); + conv2_filter_numel = filter2->numel(); + conv2_input_numel = conv1_out->numel(); + conv2_output_numel = conv2_out->numel(); + + if (has_shortcut) { + auto filter3 = ctx.Input("Filter3"); + auto conv3_out = ctx.Output("Conv3"); + conv3_input_shape = phi::vectorize(input1->dims()); + conv3_output_shape = phi::vectorize(conv3_out->dims()); + conv3_filter_shape = phi::vectorize(filter3->dims()); + conv3_filter_numel = filter3->numel(); + conv3_input_numel = input1->numel(); + conv3_output_numel = 
conv3_out->numel(); + } + } + + int padding1; + int padding2; + int padding3; + int stride1; + int stride2; + int stride3; + int dilation1; + int dilation2; + int dilation3; + int group; + + double eps; + double momentum; + + bool has_shortcut; + bool find_max; + bool global_stats; + + std::vector conv1_input_shape; + std::vector conv1_output_shape; + std::vector conv1_filter_shape; + std::vector conv2_input_shape; + std::vector conv2_output_shape; + std::vector conv2_filter_shape; + std::vector conv3_input_shape; + std::vector conv3_output_shape; + std::vector conv3_filter_shape; + + int conv1_filter_numel; + int conv2_filter_numel; + int conv3_filter_numel; + int conv1_input_numel; + int conv2_input_numel; + int conv3_input_numel; + int conv1_output_numel; + int conv2_output_numel; + int conv3_output_numel; +}; + +class ResnetBasicBlockGradAttr { + public: + explicit ResnetBasicBlockGradAttr(const framework::ExecutionContext& ctx) { + padding1 = ctx.Attr("padding1"); + padding2 = ctx.Attr("padding2"); + padding3 = ctx.Attr("padding3"); + stride1 = ctx.Attr("stride1"); + stride2 = ctx.Attr("stride2"); + stride3 = ctx.Attr("stride3"); + dilation1 = ctx.Attr("dilation1"); + dilation2 = ctx.Attr("dilation2"); + dilation3 = ctx.Attr("dilation3"); + group = ctx.Attr("group"); + + has_shortcut = ctx.Attr("has_shortcut"); + find_max = ctx.Attr("find_conv_input_max"); + + // init shape + auto input1 = ctx.Input("X"); + auto filter1 = ctx.Input("Filter1"); + auto conv1_out = ctx.Input("Conv1"); + auto filter2 = ctx.Input("Filter2"); + auto conv2_out = ctx.Input("Conv2"); + conv1_input_shape = phi::vectorize(input1->dims()); + conv1_output_shape = phi::vectorize(conv1_out->dims()); + conv1_filter_shape = phi::vectorize(filter1->dims()); + conv1_filter_numel = filter1->numel(); + conv1_input_numel = input1->numel(); + conv1_output_numel = conv1_out->numel(); + + conv2_input_shape = phi::vectorize(conv1_out->dims()); + conv2_output_shape = phi::vectorize(conv2_out->dims()); + conv2_filter_shape = phi::vectorize(filter2->dims()); + conv2_filter_numel = filter2->numel(); + conv2_input_numel = conv1_out->numel(); + conv2_output_numel = conv2_out->numel(); + + if (has_shortcut) { + auto filter3 = ctx.Input("Filter3"); + auto conv3_out = ctx.Input("Conv3"); + conv3_input_shape = phi::vectorize(input1->dims()); + conv3_output_shape = phi::vectorize(conv3_out->dims()); + conv3_filter_shape = phi::vectorize(filter3->dims()); + conv3_filter_numel = filter3->numel(); + conv3_input_numel = input1->numel(); + conv3_output_numel = conv3_out->numel(); + } + } + + int padding1; + int padding2; + int padding3; + int stride1; + int stride2; + int stride3; + int dilation1; + int dilation2; + int dilation3; + int group; + + bool has_shortcut; + bool find_max; + + std::vector conv1_input_shape; + std::vector conv1_output_shape; + std::vector conv1_filter_shape; + std::vector conv2_input_shape; + std::vector conv2_output_shape; + std::vector conv2_filter_shape; + std::vector conv3_input_shape; + std::vector conv3_output_shape; + std::vector conv3_filter_shape; + + int conv1_filter_numel; + int conv2_filter_numel; + int conv3_filter_numel; + int conv1_input_numel; + int conv2_input_numel; + int conv3_input_numel; + int conv1_output_numel; + int conv2_output_numel; + int conv3_output_numel; +}; + +template +static inline void xpu_conv2d(xpu::Context* ctx, + const T* input_data, + const T* filter_data, + T* output_data, + float* input_max_data, + float* filter_max_data, + const std::vector& input_shape, + const std::vector& 
filter_shape, + int padding, + int stride, + int dilation, + int group) { + std::vector ksize{filter_shape[2], filter_shape[3]}; + std::vector stride_vec{stride, stride}; + std::vector dilation_vec{dilation, dilation}; + std::vector padding_vec{padding, padding}; + int N = input_shape[0]; + int C = input_shape[1]; + int H = input_shape[2]; + int W = input_shape[3]; + + int r = xpu::conv2d(ctx, + input_data, + filter_data, + output_data, + N, + C, + H, + W, + filter_shape[0], + ksize, + stride_vec, + padding_vec, + dilation_vec, + group, + input_max_data, + filter_max_data, + nullptr, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d"); +} + +template +static inline void xpu_conv2d_grad(xpu::Context* ctx, + const T* input_data, + const T* filter_data, + const T* output_grad_data, + T* input_grad_data, + T* filter_grad_data, + const float* input_max_data, + const float* filter_max_data, + const std::vector& input_shape, + const std::vector& filter_shape, + int padding, + int stride, + int dilation, + int group) { + std::vector ksize{filter_shape[2], filter_shape[3]}; + std::vector stride_vec{stride, stride}; + std::vector dilation_vec{dilation, dilation}; + std::vector padding_vec{padding, padding}; + int N = input_shape[0]; + int C = input_shape[1]; + int H = input_shape[2]; + int W = input_shape[3]; + + int r = xpu::conv2d_grad(ctx, + input_data, + filter_data, + output_grad_data, + input_grad_data, + filter_grad_data, + N, + C, + H, + W, + filter_shape[0], + ksize, + stride_vec, + padding_vec, + dilation_vec, + group, + input_max_data, + filter_max_data, + nullptr, + nullptr, + nullptr, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_grad"); +} + +template +class ResNetBasicBlockXPUKernel : public framework::OpKernel { + public: + using XPUT = typename XPUTypeTrait::Type; + + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(ctx.GetPlace()), + true, + platform::errors::PreconditionNotMet("It must use XPUPlace.")); + + // input + const Tensor* x = ctx.Input("X"); + const Tensor* filter1 = ctx.Input("Filter1"); + const Tensor* scale1 = ctx.Input("Scale1"); + const Tensor* bias1 = ctx.Input("Bias1"); + const Tensor* filter2 = ctx.Input("Filter2"); + const Tensor* scale2 = ctx.Input("Scale2"); + const Tensor* bias2 = ctx.Input("Bias2"); + + // output + Tensor* conv1_output = ctx.Output("Conv1"); + Tensor* conv2_output = ctx.Output("Conv2"); + Tensor* conv2_input = ctx.Output("Conv2Input"); + Tensor* output = ctx.Output("Y"); + + auto place = ctx.GetPlace(); + auto x_data = reinterpret_cast(x->data()); + auto conv1_filter_data = reinterpret_cast(filter1->data()); + auto conv2_filter_data = reinterpret_cast(filter2->data()); + auto conv1_output_data = + reinterpret_cast(conv1_output->mutable_data(place)); + auto conv2_input_data = + reinterpret_cast(conv2_input->mutable_data(place)); + auto conv2_output_data = + reinterpret_cast(conv2_output->mutable_data(place)); + auto scale1_data = scale1->data(); + auto scale2_data = scale2->data(); + auto bias1_data = bias1->data(); + auto bias2_data = bias2->data(); + auto output_data = reinterpret_cast(output->mutable_data(place)); + + float* conv1_input_max_data = nullptr; + float* conv1_filter_max_data = nullptr; + float* conv2_input_max_data = nullptr; + float* conv2_filter_max_data = nullptr; + float* conv3_input_max_data = nullptr; + float* conv3_filter_max_data = nullptr; + + ResnetBasicBlockAttr attr(ctx); + + // init find max + if (attr.find_max) { + Tensor* max_input1 = 
ctx.Output("MaxInput1"); + Tensor* max_filter1 = ctx.Output("MaxFilter1"); + conv1_input_max_data = max_input1->mutable_data(place); + conv1_filter_max_data = max_filter1->mutable_data(place); + + Tensor* max_input2 = ctx.Output("MaxInput2"); + Tensor* max_filter2 = ctx.Output("MaxFilter2"); + conv2_input_max_data = max_input2->mutable_data(place); + conv2_filter_max_data = max_filter2->mutable_data(place); + + if (attr.has_shortcut) { + Tensor* max_input3 = ctx.Output("MaxInput3"); + Tensor* max_filter3 = ctx.Output("MaxFilter3"); + conv3_input_max_data = max_input3->mutable_data(place); + conv3_filter_max_data = max_filter3->mutable_data(place); + } + } + + auto& dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int r = XPU_SUCCESS; + + // 1. short + const XPUT* z_out_data = nullptr; + if (attr.has_shortcut) { + Tensor* conv3_out = ctx.Output("Conv3"); + const Tensor* filter3 = ctx.Input("Filter3"); + auto conv3_filter_data = + reinterpret_cast(filter3->data()); + auto conv3_output_data = + reinterpret_cast(conv3_out->mutable_data(place)); + + XPUT* conv3_input_l3_data = nullptr; + XPUT* conv3_filter_l3_data = + RAII_GUARD.alloc_l3(attr.conv3_filter_numel); + + if (attr.find_max) { + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + x_data, + conv3_input_max_data, + conv3_input_l3_data, + attr.conv3_input_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + conv3_filter_data, + conv3_filter_max_data, + conv3_filter_l3_data, + attr.conv3_filter_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + } + + xpu_conv2d(dev_ctx.x_context(), + conv3_input_l3_data != nullptr ? conv3_input_l3_data : x_data, + conv3_filter_l3_data, + conv3_output_data, + conv3_input_max_data, + conv3_filter_max_data, + attr.conv3_input_shape, + attr.conv3_filter_shape, + attr.padding3, + attr.stride3, + attr.dilation3, + attr.group); + + // bn3 + const Tensor* scale3 = ctx.Input("Scale3"); + const Tensor* bias3 = ctx.Input("Bias3"); + auto bias3_data = bias3->data(); + auto scale3_data = scale3->data(); + + auto bn3_output_data = RAII_GUARD.alloc(attr.conv3_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(bn3_output_data); + + if (!attr.global_stats) { + Tensor* saved_mean3 = ctx.Output("SavedMean3"); + Tensor* saved_invstd3 = ctx.Output("SavedInvstd3"); + Tensor* running_mean3 = ctx.Output("Mean3Out"); + Tensor* running_var3 = ctx.Output("Var3Out"); + + auto saved_mean3_data = saved_mean3->mutable_data(place); + auto saved_invstd3_data = saved_invstd3->mutable_data(place); + auto running_mean3_data = running_mean3->mutable_data(place); + auto running_var3_data = running_var3->mutable_data(place); + + r = xpu::batch_norm_fusion(dev_ctx.x_context(), + conv3_output_data, + bn3_output_data, + attr.conv3_output_shape[0], + attr.conv3_output_shape[1], + attr.conv3_output_shape[3], + attr.conv3_output_shape[3], + attr.eps, + attr.momentum, + scale3_data, + bias3_data, + saved_mean3_data, + saved_invstd3_data, + running_mean3_data, + running_var3_data, + true, + nullptr, + xpu::Activation_t::LINEAR, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); + } else { + const auto* mean3 = ctx.Input("Mean3"); + const auto* var3 = ctx.Input("Var3"); + const auto* mean3_data = mean3->data(); + const auto* variance3_data = var3->data(); + r = xpu::batch_norm_infer(dev_ctx.x_context(), + conv3_output_data, + bn3_output_data, + attr.conv3_output_shape[0], + attr.conv3_output_shape[1], + 
attr.conv3_output_shape[2], + attr.conv3_output_shape[3], + attr.eps, + scale3_data, + bias3_data, + mean3_data, + variance3_data, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_infer"); + } + z_out_data = reinterpret_cast(bn3_output_data); + } else { + z_out_data = x_data; + } + + // 2. conv1 + XPUT* conv1_input_l3_data = nullptr; + XPUT* conv1_filter_l3_data = + RAII_GUARD.alloc_l3(attr.conv1_filter_numel); + if (attr.find_max) { + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + x_data, + conv1_input_max_data, + conv1_input_l3_data, + attr.conv1_input_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + conv1_filter_data, + conv1_filter_max_data, + conv1_filter_l3_data, + attr.conv1_filter_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + } + xpu_conv2d(dev_ctx.x_context(), + conv1_input_l3_data != nullptr ? conv1_input_l3_data : x_data, + conv1_filter_l3_data, + conv1_output_data, + conv1_input_max_data, + conv1_filter_max_data, + attr.conv1_input_shape, + attr.conv1_filter_shape, + attr.padding1, + attr.stride1, + attr.dilation1, + attr.group); + + // 3. bn1 + relu + if (!attr.global_stats) { + Tensor* saved_mean1 = ctx.Output("SavedMean1"); + Tensor* saved_invstd1 = ctx.Output("SavedInvstd1"); + Tensor* running_mean1 = ctx.Output("Mean1Out"); + Tensor* running_var1 = ctx.Output("Var1Out"); + + auto saved_mean1_data = saved_mean1->mutable_data(place); + auto saved_invstd1_data = saved_invstd1->mutable_data(place); + auto running_mean1_data = running_mean1->mutable_data(place); + auto running_var1_data = running_var1->mutable_data(place); + + r = xpu::batch_norm_fusion(dev_ctx.x_context(), + conv1_output_data, + conv2_input_data, + attr.conv1_output_shape[0], + attr.conv1_output_shape[1], + attr.conv1_output_shape[2], + attr.conv1_output_shape[3], + attr.eps, + attr.momentum, + scale1_data, + bias1_data, + saved_mean1_data, + saved_invstd1_data, + running_mean1_data, + running_var1_data, + true, + nullptr, + xpu::Activation_t::RELU, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); + } else { + // bn --> relu + auto bn1_output_data = RAII_GUARD.alloc(attr.conv1_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(bn1_output_data); + + const auto* mean1 = ctx.Input("Mean1"); + const auto* var1 = ctx.Input("Var1"); + const auto* mean_data = mean1->data(); + const auto* variance_data = var1->data(); + r = xpu::batch_norm_infer(dev_ctx.x_context(), + conv1_output_data, + bn1_output_data, + attr.conv1_output_shape[0], + attr.conv1_output_shape[1], + attr.conv1_output_shape[2], + attr.conv1_output_shape[3], + attr.eps, + scale1_data, + bias1_data, + mean_data, + variance_data, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_infer"); + + r = xpu::relu(dev_ctx.x_context(), + bn1_output_data, + conv2_input_data, + attr.conv1_output_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); + } + + // 4. 
conv2 + XPUT* conv2_input_l3_data = nullptr; + XPUT* conv2_filter_l3_data = + RAII_GUARD.alloc_l3(attr.conv2_filter_numel); + if (attr.find_max) { + Tensor* max_input2 = ctx.Output("MaxInput2"); + Tensor* max_filter2 = ctx.Output("MaxFilter2"); + conv2_input_max_data = max_input2->mutable_data(place); + conv2_filter_max_data = max_filter2->mutable_data(place); + + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + conv2_input_data, + conv2_input_max_data, + conv2_input_l3_data, + attr.conv2_input_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + + r = xpu::findmax_copy_fusion(dev_ctx.x_context(), + conv2_filter_data, + conv2_filter_max_data, + conv2_filter_l3_data, + attr.conv2_filter_numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax_copy_fusion"); + } + xpu_conv2d( + dev_ctx.x_context(), + conv2_input_l3_data != nullptr ? conv2_input_l3_data : conv2_input_data, + conv2_filter_l3_data, + conv2_output_data, + conv2_input_max_data, + conv2_filter_max_data, + attr.conv2_input_shape, + attr.conv2_filter_shape, + attr.padding2, + attr.stride2, + attr.dilation2, + attr.group); + + // 5. bn2 + if (!attr.global_stats) { + Tensor* saved_mean2 = ctx.Output("SavedMean2"); + Tensor* saved_var2 = ctx.Output("SavedInvstd2"); + Tensor* running_mean2 = ctx.Output("Mean2Out"); + Tensor* running_var2 = ctx.Output("Var2Out"); + + auto saved_mean2_data = saved_mean2->mutable_data(place); + auto saved_var2_data = saved_var2->mutable_data(place); + auto running_mean2_data = running_mean2->mutable_data(place); + auto running_var2_data = running_var2->mutable_data(place); + + r = xpu::batch_norm_fusion(dev_ctx.x_context(), + conv2_output_data, + output_data, + attr.conv2_output_shape[0], + attr.conv2_output_shape[1], + attr.conv2_output_shape[2], + attr.conv2_output_shape[3], + attr.eps, + attr.momentum, + scale2_data, + bias2_data, + saved_mean2_data, + saved_var2_data, + running_mean2_data, + running_var2_data, + true, + z_out_data, + xpu::Activation_t::RELU, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_fusion"); + } else { + auto bn2_out_data = RAII_GUARD.alloc(attr.conv2_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(bn2_out_data); + + const auto* mean2 = ctx.Input("Mean2"); + const auto* var2 = ctx.Input("Var2"); + const auto* mean_data = mean2->data(); + const auto* variance_data = var2->data(); + r = xpu::batch_norm_infer(dev_ctx.x_context(), + conv2_output_data, + bn2_out_data, + attr.conv2_output_shape[0], + attr.conv2_output_shape[1], + attr.conv2_output_shape[2], + attr.conv2_output_shape[3], + attr.eps, + scale2_data, + bias2_data, + mean_data, + variance_data, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_infer"); + + r = xpu::add_activation_fusion(dev_ctx.x_context(), + bn2_out_data, + z_out_data, + output_data, + output->numel(), + nullptr, + nullptr, + nullptr, + xpu::Activation_t::RELU); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add_activation_fusion"); + } + } +}; + +template +class ResNetBasicBlockGradXPUKernel : public framework::OpKernel { + public: + using XPUT = typename XPUTypeTrait::Type; + + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(ctx.GetPlace()), + true, + platform::errors::PreconditionNotMet("It must use XPUPlace.")); + + const Tensor* y_grad = ctx.Input(framework::GradVarName("Y")); + const Tensor* y = ctx.Input("Y"); + + const Tensor* x = ctx.Input("X"); + const Tensor* filter1 = ctx.Input("Filter1"); + const Tensor* scale1 = ctx.Input("Scale1"); + const Tensor* filter2 = 
ctx.Input("Filter2"); + const Tensor* scale2 = ctx.Input("Scale2"); + const Tensor* saved_mean1 = ctx.Input("SavedMean1"); + const Tensor* saved_invstd1 = ctx.Input("SavedInvstd1"); + const Tensor* saved_mean2 = ctx.Input("SavedMean2"); + const Tensor* saved_invstd2 = ctx.Input("SavedInvstd2"); + const Tensor* conv1_out = ctx.Input("Conv1"); + const Tensor* conv2_out = ctx.Input("Conv2"); + const Tensor* conv2_input = ctx.Input("Conv2Input"); + + const Tensor* filter3 = ctx.Input("Filter3"); + const Tensor* conv3_out = ctx.Input("Conv3"); + const Tensor* scale3 = ctx.Input("Scale3"); + const Tensor* saved_mean3 = ctx.Input("SavedMean3"); + const Tensor* saved_invstd3 = ctx.Input("SavedInvstd3"); + + const Tensor* conv1_input_max = ctx.Input("MaxInput1"); + const Tensor* conv1_filter_max = ctx.Input("MaxFilter1"); + const Tensor* conv2_input_max = ctx.Input("MaxInput2"); + const Tensor* conv2_filter_max = ctx.Input("MaxFilter2"); + const Tensor* conv3_input_max = ctx.Input("MaxInput3"); + const Tensor* conv3_filter_max = ctx.Input("MaxFilter3"); + + Tensor* x_grad = ctx.Output(framework::GradVarName("X")); + Tensor* filter1_grad = + ctx.Output(framework::GradVarName("Filter1")); + Tensor* scale1_grad = ctx.Output(framework::GradVarName("Scale1")); + Tensor* bias1_grad = ctx.Output(framework::GradVarName("Bias1")); + Tensor* filter2_grad = + ctx.Output(framework::GradVarName("Filter2")); + Tensor* scale2_grad = ctx.Output(framework::GradVarName("Scale2")); + Tensor* bias2_grad = ctx.Output(framework::GradVarName("Bias2")); + Tensor* filter3_grad = + ctx.Output(framework::GradVarName("Filter3")); + Tensor* scale3_grad = ctx.Output(framework::GradVarName("Scale3")); + Tensor* bias3_grad = ctx.Output(framework::GradVarName("Bias3")); + + // attrs + ResnetBasicBlockGradAttr attr(ctx); + auto place = ctx.GetPlace(); + + const auto* y_grad_data = reinterpret_cast(y_grad->data()); + const auto* y_data = reinterpret_cast(y->data()); + const auto* x_data = reinterpret_cast(x->data()); + const auto* conv1_output_data = + reinterpret_cast(conv1_out->data()); + const auto* conv1_filter_data = + reinterpret_cast(filter1->data()); + const auto* conv2_input_data = + reinterpret_cast(conv2_input->data()); + const auto* conv2_output_data = + reinterpret_cast(conv2_out->data()); + const auto* conv2_filter_data = + reinterpret_cast(filter2->data()); + + const auto* scale2_data = scale2->data(); + const auto* saved_mean2_data = saved_mean2->data(); + const auto* saved_invstd2_data = saved_invstd2->data(); + const auto* scale1_data = scale1->data(); + const auto* saved_mean1_data = saved_mean1->data(); + const auto* saved_invstd1_data = saved_invstd1->data(); + auto* scale2_grad_data = scale2_grad->mutable_data(place); + auto* bias2_grad_data = bias2_grad->mutable_data(place); + + const float* conv1_input_max_data = nullptr; + const float* conv1_filter_max_data = nullptr; + const float* conv2_input_max_data = nullptr; + const float* conv2_filter_max_data = nullptr; + const float* conv3_input_max_data = nullptr; + const float* conv3_filter_max_data = nullptr; + if (attr.find_max) { + conv1_input_max_data = + reinterpret_cast(conv1_input_max->data()); + conv1_filter_max_data = + reinterpret_cast(conv1_filter_max->data()); + conv2_input_max_data = + reinterpret_cast(conv2_input_max->data()); + conv2_filter_max_data = + reinterpret_cast(conv2_filter_max->data()); + if (attr.has_shortcut) { + conv3_input_max_data = + reinterpret_cast(conv3_input_max->data()); + conv3_filter_max_data = + 
reinterpret_cast(conv3_filter_max->data()); + } + } + + auto& dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int r = XPU_SUCCESS; + + // 0. bn2, bn2_fusion grad + auto conv2_output_grad_data = + RAII_GUARD.alloc(attr.conv2_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(conv2_output_grad_data); + + XPUT* z_output_grad_data = nullptr; + XPUT* z_grad_data = nullptr; + if (!attr.has_shortcut) { + z_output_grad_data = RAII_GUARD.alloc(attr.conv1_input_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(z_output_grad_data); + z_grad_data = z_output_grad_data; + } else { + z_output_grad_data = RAII_GUARD.alloc(attr.conv3_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(z_output_grad_data); + + z_grad_data = RAII_GUARD.alloc(attr.conv1_input_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(z_grad_data); + } + + r = xpu::batch_norm_grad_fusion(dev_ctx.x_context(), + conv2_output_data, + y_data, + y_grad_data, + conv2_output_grad_data, + attr.conv2_output_shape[0], + attr.conv2_output_shape[1], + attr.conv2_output_shape[2], + attr.conv2_output_shape[3], + scale2_data, + saved_mean2_data, + saved_invstd2_data, + scale2_grad_data, + bias2_grad_data, + true, + z_output_grad_data, + xpu::Activation_t::RELU, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_grad_fusion"); + + if (attr.has_shortcut) { + // bn3 grad + const auto* conv3_output_data = + reinterpret_cast(conv3_out->data()); + const auto* scale3_data = scale3->data(); + const auto* saved_mean3_data = saved_mean3->data(); + const auto* saved_invstd3_data = saved_invstd3->data(); + auto* scale3_grad_data = scale3_grad->mutable_data(place); + auto* bias3_grad_data = bias3_grad->mutable_data(place); + auto* conv3_output_grad_data = + RAII_GUARD.alloc(attr.conv3_output_numel); + + r = xpu::batch_norm_grad(dev_ctx.x_context(), + conv3_output_data, + z_output_grad_data, + conv3_output_grad_data, + attr.conv3_output_shape[0], + attr.conv3_output_shape[1], + attr.conv3_output_shape[2], + attr.conv3_output_shape[3], + scale3_data, + saved_mean3_data, + saved_invstd3_data, + scale3_grad_data, + bias3_grad_data, + true); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_grad"); + + // conv3 grad + auto* conv3_filter_grad_data = + reinterpret_cast(filter3_grad->mutable_data(place)); + auto* conv3_filter_data = + reinterpret_cast(filter3->data()); + xpu_conv2d_grad(dev_ctx.x_context(), + x_data, + conv3_filter_data, + conv3_output_grad_data, + z_grad_data, + conv3_filter_grad_data, + conv3_input_max_data, + conv3_filter_max_data, + attr.conv3_input_shape, + attr.conv3_filter_shape, + attr.padding3, + attr.stride3, + attr.dilation3, + attr.group); + } + + // 2. conv2_grad + auto* conv2_filter_grad_data = + reinterpret_cast(filter2_grad->mutable_data(place)); + auto* conv2_input_grad_data = + RAII_GUARD.alloc(attr.conv2_input_numel); + xpu_conv2d_grad(dev_ctx.x_context(), + conv2_input_data, + conv2_filter_data, + conv2_output_grad_data, + conv2_input_grad_data, + conv2_filter_grad_data, + conv2_input_max_data, + conv2_filter_max_data, + attr.conv2_input_shape, + attr.conv2_filter_shape, + attr.padding2, + attr.stride2, + attr.dilation2, + attr.group); + + // 3. 
b1 grad + auto* conv1_output_grad_data = + RAII_GUARD.alloc(attr.conv1_output_numel); + PADDLE_ENFORCE_XDNN_NOT_NULL(conv1_output_grad_data); + auto* scale1_grad_data = scale1_grad->mutable_data(ctx.GetPlace()); + auto* bias1_grad_data = bias1_grad->mutable_data(ctx.GetPlace()); + r = xpu::batch_norm_grad_fusion(dev_ctx.x_context(), + conv1_output_data, + conv2_input_data, + conv2_input_grad_data, + conv1_output_grad_data, + attr.conv1_output_shape[0], + attr.conv1_output_shape[1], + attr.conv1_output_shape[2], + attr.conv1_output_shape[3], + scale1_data, + saved_mean1_data, + saved_invstd1_data, + scale1_grad_data, + bias1_grad_data, + true, + nullptr, + xpu::Activation_t::RELU, + nullptr, + 0); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "batch_norm_grad_fusion"); + + // 4. conv1_grad + auto* x_grad_data = reinterpret_cast(x_grad->mutable_data(place)); + auto* conv1_filter_grad_data = + reinterpret_cast(filter1_grad->mutable_data(place)); + xpu_conv2d_grad(dev_ctx.x_context(), + x_data, + conv1_filter_data, + conv1_output_grad_data, + x_grad_data, + conv1_filter_grad_data, + conv1_input_max_data, + conv1_filter_max_data, + attr.conv1_input_shape, + attr.conv1_filter_shape, + attr.padding1, + attr.stride1, + attr.dilation1, + attr.group); + + // add z_grad to x_grad + r = xpu::add( + dev_ctx.x_context(), x_grad_data, z_grad_data, x_grad_data, x->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_XPU_KERNEL( + resnet_basic_block, + ops::ResNetBasicBlockXPUKernel, + ops::ResNetBasicBlockXPUKernel); +REGISTER_OP_XPU_KERNEL( + resnet_basic_block_grad, + ops::ResNetBasicBlockGradXPUKernel, + ops::ResNetBasicBlockGradXPUKernel); +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 2b80396cc3138..204cb0015048d 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -505,6 +505,14 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sequence_conv_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + + // Fused op + {"resnet_basic_block_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"resnet_basic_block", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, }; return s_xpu2_kernels; From 7be637a7ad1f584d9c8858eab68ba128a5b6d09a Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Fri, 8 Jul 2022 14:06:42 +0800 Subject: [PATCH 102/250] [Paddle-TRT] fix_pool (#44139) * fix_pool * fix pool_op_plugin --- paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 69c317781ef57..21eb89d135efa 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -184,11 +184,6 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( platform::errors::InvalidArgument( "The Split plugin should be only one input.")); - PADDLE_ENFORCE_EQ( - inputs[0].d[1]->isConstant(), - true, - platform::errors::InvalidArgument("The channel dimension should be " - "static, but we found it's dynamic.")); 
nvinfer1::DimsExprs output(inputs[0]); if (is_global_ && !adaptive_) { output.d[2] = expr_builder.constant(1); From 19902a1291c3f07f9671324ee9e6b42f56f64fbc Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Fri, 8 Jul 2022 14:16:03 +0800 Subject: [PATCH 103/250] unsqueeze2 support fp16. test=kunlun (#44142) --- paddle/fluid/platform/device/xpu/xpu2_op_list.h | 6 ++++-- .../fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 204cb0015048d..2fa287b80f451 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -482,7 +482,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), @@ -490,7 +491,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::INT8, XPUPlace()), pOpKernelType(vartype::UINT8, XPUPlace()), - pOpKernelType(vartype::FP32, XPUPlace())})}, + pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"where_index", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py index e9fc66ca4fcce..8ba7f6818882a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py @@ -69,7 +69,7 @@ def test_check_output(self): def test_check_grad(self): place = paddle.XPUPlace(0) - if self.dtype in [np.float32, np.float64]: + if self.dtype in [np.float32, np.float64, np.float16]: self.check_grad_with_place(place, ['X'], 'Out') elif self.dtype == np.bool_: return @@ -147,7 +147,7 @@ def test_check_output(self): def test_check_grad(self): place = paddle.XPUPlace(0) - if self.dtype in [np.float32, np.float64]: + if self.dtype in [np.float32, np.float64, np.float16]: self.check_grad_with_place(place, ['X'], 'Out') else: return @@ -217,7 +217,7 @@ def test_check_output(self): def test_check_grad(self): place = paddle.XPUPlace(0) - if self.dtype in [np.float32, np.float64]: + if self.dtype in [np.float32, np.float64, np.float16]: self.check_grad_with_place(place, ['X'], 'Out') else: return From 61de8af873e9409ded511cc95a1abdb84968a6c0 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Fri, 8 Jul 2022 15:36:03 +0800 Subject: [PATCH 104/250] fix tuple input for _conv_nd (#44108) --- python/paddle/nn/functional/conv.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index d8dc68376d163..3e73cab8f2cd8 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -130,6 +130,10 @@ def _conv_nd(x, if bias is not None: channel_dim = channel_dim + len( x.shape) if channel_dim < 0 else channel_dim + if isinstance(x, tuple): + x = x[0] + if isinstance(bias, tuple): + bias = bias[0] if len(bias.shape) < len(x.shape): tmp_bias = 
_C_ops.final_state_reshape( bias, bias.shape + From 21ae549ea77d9de256449e09e6feaeebc7a6d6f6 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Fri, 8 Jul 2022 16:49:46 +0800 Subject: [PATCH 105/250] Fix Argmax Layout autotune (#44080) --- paddle/fluid/imperative/layout_transformer.h | 5 +++-- python/paddle/fluid/tests/unittests/test_layout_autotune.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 50d3e2b6ac139..ab7619dedb2e9 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -374,8 +374,9 @@ class ArgmaxOpTransformer bool keep_dims = BOOST_GET_CONST(bool, (*attrs)["keepdims"]); if (keep_dims) { if (var_layout != DataLayout::UNDEFINED) { - std::vector perm_nhwc = {0, 2, 3, 1}; - std::vector perm_nchw = {0, 3, 1, 2}; + std::vector perm_nhwc = {0, 3, 1, 2}; + std::vector perm_nchw = {0, 2, 3, 1}; + auto perm = var_layout == DataLayout::NHWC ? perm_nhwc : perm_nchw; switch (AttrTypeID((*attrs)["axis"])) { case paddle::framework::proto::AttrType::INT: { diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index 6e25e3719d3cd..fc9b51c5fc040 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -146,7 +146,7 @@ def test_argmax_op_transposer_keep_dims(self): out = paddle.argmax(conv_out, axis=1, keepdim=True) self.assertEqual(conv_out.shape, [1, 14, 12, 8]) - self.assertEqual(out.shape, [1, 14, 1, 8]) + self.assertEqual(out.shape, [1, 14, 12, 1]) def test_argmax_op_transposer(self): if not self.use_autoune(): From 2fc93f390c2d69f5d0d0ca679484bc2e46a20426 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Fri, 8 Jul 2022 17:14:05 +0800 Subject: [PATCH 106/250] [JitLayer]Pybind JitLayer VarBase Function and add python UT (#44010) * Pybind JitLayer VarBase Function and add python UT * Add multi program load UT * Fix UT place error * Update jit.save param name * Remove some comments * Polish cmakelists * Polish JitLayer in Python * Fix comments --- paddle/fluid/jit/CMakeLists.txt | 10 +-- paddle/fluid/jit/compilation_unit.cc | 20 ++++- paddle/fluid/jit/compilation_unit.h | 8 +- paddle/fluid/jit/executor_function.h | 2 + paddle/fluid/jit/layer.cc | 8 ++ paddle/fluid/jit/layer.h | 4 + paddle/fluid/pybind/CMakeLists.txt | 6 +- paddle/fluid/pybind/jit.cc | 83 +++++++++++++++++++ paddle/fluid/pybind/jit.h | 27 ++++++ paddle/fluid/pybind/pybind.cc | 2 + .../fluid/tests/unittests/test_jit_layer.py | 82 ++++++++++++++++++ python/paddle/jit/layer.py | 51 ++++++++++++ 12 files changed, 289 insertions(+), 14 deletions(-) create mode 100644 paddle/fluid/pybind/jit.cc create mode 100644 paddle/fluid/pybind/jit.h create mode 100644 python/paddle/fluid/tests/unittests/test_jit_layer.py create mode 100644 python/paddle/jit/layer.py diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index 9c96d8e986a22..75483ac6544f4 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -26,7 +26,8 @@ cc_library( cc_library( jit_layer SRCS layer.cc - DEPS jit_compilation_unit) + DEPS jit_serializer jit_function_utils jit_serializer_utils + jit_compilation_unit jit_function_schema) if(WITH_TESTING AND NOT WIN32 @@ -45,12 +46,7 @@ if(WITH_TESTING 
feed_op fetch_op scale_op - jit_serializer - jit_layer - jit_function_utils - jit_function_schema - jit_compilation_unit - jit_serializer_utils) + jit_layer) cc_test( layer_test SRCS layer_test.cc diff --git a/paddle/fluid/jit/compilation_unit.cc b/paddle/fluid/jit/compilation_unit.cc index 261839b479e5b..d62c497d8b338 100644 --- a/paddle/fluid/jit/compilation_unit.cc +++ b/paddle/fluid/jit/compilation_unit.cc @@ -22,16 +22,28 @@ namespace jit { std::shared_ptr CompilationUnit::Function( const std::string &name) const { PADDLE_ENFORCE_EQ( - function_dict_.count(name), + function_map_.count(name), 1, platform::errors::InvalidArgument( - "Funciton name %s is not exist in function_dict_.", name)); - return function_dict_.at(name); + "Funciton name %s is not exist in function_map_.", name)); + return function_map_.at(name); } void CompilationUnit::SetFunction( const std::string &name, const std::shared_ptr &function) { - function_dict_[name] = function; + function_map_[name] = function; +} + +std::vector CompilationUnit::FunctionNames() const { + std::vector names; + for (auto it = function_map_.begin(); it != function_map_.end(); it++) { + names.emplace_back(it->first); + } + return names; +} + +const Name2FunctionMap &CompilationUnit::FunctionMap() const { + return function_map_; } } // namespace jit diff --git a/paddle/fluid/jit/compilation_unit.h b/paddle/fluid/jit/compilation_unit.h index 2944aa928f32f..45a771b649401 100644 --- a/paddle/fluid/jit/compilation_unit.h +++ b/paddle/fluid/jit/compilation_unit.h @@ -21,6 +21,8 @@ namespace paddle { namespace jit { +using Name2FunctionMap = + std::unordered_map>; class CompilationUnit { public: @@ -32,8 +34,12 @@ class CompilationUnit { void SetFunction(const std::string &name, const std::shared_ptr &function); + std::vector FunctionNames() const; + + const Name2FunctionMap &FunctionMap() const; + private: - std::unordered_map> function_dict_; + Name2FunctionMap function_map_; }; } // namespace jit diff --git a/paddle/fluid/jit/executor_function.h b/paddle/fluid/jit/executor_function.h index 224798b7dbb2b..36cb438e34cc2 100644 --- a/paddle/fluid/jit/executor_function.h +++ b/paddle/fluid/jit/executor_function.h @@ -56,6 +56,8 @@ class ExecutorFunction : public BaseFunction { return res; } + const std::shared_ptr &Info() const { return info_; } + private: std::shared_ptr info_; framework::Scope scope_; diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc index a11101d520493..6662abd17d2cf 100644 --- a/paddle/fluid/jit/layer.cc +++ b/paddle/fluid/jit/layer.cc @@ -42,5 +42,13 @@ void Layer::SetFunction(const std::string& name, unit_.SetFunction(name, function); } +std::vector Layer::FunctionNames() const { + return unit_.FunctionNames(); +} + +const Name2FunctionMap& Layer::FunctionMap() const { + return unit_.FunctionMap(); +} + } // namespace jit } // namespace paddle diff --git a/paddle/fluid/jit/layer.h b/paddle/fluid/jit/layer.h index 1407259d14444..5c9f61b0d47b3 100644 --- a/paddle/fluid/jit/layer.h +++ b/paddle/fluid/jit/layer.h @@ -50,6 +50,10 @@ class Layer { void SetFunction(const std::string& name, const std::shared_ptr& function); + std::vector FunctionNames() const; + + const Name2FunctionMap& FunctionMap() const; + private: // internal::Object obj_; Name2VariableMap params_dict_; diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 2b7e12499976e..b2ecf36c5d227 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -38,7 +38,8 @@ set(PYBIND_DEPS 
global_utils phi_utils tcp_store - new_profiler) + new_profiler + jit_layer) if(WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) @@ -121,7 +122,8 @@ set(PYBIND_SRCS io.cc generator_py.cc communication.cc - cuda_streams_py.cc) + cuda_streams_py.cc + jit.cc) if(WITH_CUSTOM_DEVICE) set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc new file mode 100644 index 0000000000000..07b79742f002e --- /dev/null +++ b/paddle/fluid/pybind/jit.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/jit.h" + +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/fluid/jit/executor_function.h" +#include "paddle/fluid/jit/function_schema.h" +#include "paddle/fluid/jit/layer.h" +#include "paddle/fluid/jit/serializer.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +using Variable = paddle::framework::Variable; + +void BindJit(pybind11::module *m) { + py::class_(*m, "Layer", R"DOC(Layer Class.)DOC") + .def("function_dict", &jit::Layer::FunctionMap); + + py::class_>( + *m, "ExectorFunction", R"DOC(ExectorFunction Class.)DOC") + .def("__call__", + [](jit::ExecutorFunction &self, + const std::vector> + &tensor_inputs) { + std::vector var_inputs; + for (auto &tensor : tensor_inputs) { + var_inputs.emplace_back(tensor->Var()); + } + auto var_outputs = self(var_inputs); + + std::vector> tensor_outputs; + auto output_names = self.Info()->OutputArgNames(); + for (size_t i = 0; i < var_outputs.size(); ++i) { + auto var = var_outputs[i]; + std::string name = output_names[i]; + imperative::VariableWrapper var_wrapper(name, var); + auto shared_wrapper = + std::make_shared(var_wrapper); + auto shared_varbase = + std::make_shared(shared_wrapper); + tensor_outputs.emplace_back(shared_varbase); + } + return tensor_outputs; + }) + .def("info", &jit::ExecutorFunction::Info); + + py::class_>( + *m, "FunctionInfo", R"DOC(FunctionInfo Class.)DOC") + .def("name", &jit::FunctionInfo::FunctionName) + .def("input_names", &jit::FunctionInfo::InputArgNames) + .def("output_names", &jit::FunctionInfo::OutputArgNames); + + m->def("Load", + [](const std::string &path, const platform::CPUPlace &cpu_place) { + return paddle::jit::Load(path, cpu_place); + }); + + m->def("Load", + [](const std::string &path, const platform::CUDAPlace &cuda_place) { + return paddle::jit::Load(path, cuda_place); + }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/jit.h b/paddle/fluid/pybind/jit.h new file mode 100644 index 0000000000000..897e22e8b8594 --- /dev/null +++ b/paddle/fluid/pybind/jit.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +void BindJit(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index abbcacec3858e..62f0402bedc7a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -90,6 +90,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/io.h" +#include "paddle/fluid/pybind/jit.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" @@ -563,6 +564,7 @@ PYBIND11_MODULE(core_noavx, m) { BindEager(&m); BindEagerStringTensor(&m); BindCudaStream(&m); + BindJit(&m); // Not used, just make sure cpu_info.cc is linked. paddle::platform::CpuTotalPhysicalMemory(); diff --git a/python/paddle/fluid/tests/unittests/test_jit_layer.py b/python/paddle/fluid/tests/unittests/test_jit_layer.py new file mode 100644 index 0000000000000..24c0131fd7012 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_jit_layer.py @@ -0,0 +1,82 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
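As a rough illustration of how the bindings defined above surface on the Python side, the sketch below loads a module saved with paddle.jit.save(..., combine_params=True) through the raw Load binding and lists each exported function; the save path is a placeholder, and the Layer/Function wrappers added later in this patch (python/paddle/jit/layer.py) are the intended user-facing entry point.

    # Hedged sketch against the raw pybind API above; '/tmp/multi_program'
    # is a placeholder for a directory produced by
    # paddle.jit.save(model, path, combine_params=True).
    import paddle
    from paddle.fluid.core import Load

    place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \
        else paddle.CPUPlace()
    cpp_layer = Load('/tmp/multi_program', place)
    for name, fn in cpp_layer.function_dict().items():
        info = fn.info()
        print(name, info.input_names(), info.output_names())
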
+ +import os +import paddle +import unittest +import tempfile +import numpy as np +from paddle.static import InputSpec +from paddle.fluid.framework import _enable_legacy_dygraph +from paddle.jit.layer import Layer +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator + +_enable_legacy_dygraph() +paddle.seed(1) + + +class Net(paddle.nn.Layer): + + def __init__(self): + super(Net, self).__init__() + self.fc1 = paddle.nn.Linear(4, 4) + self.fc2 = paddle.nn.Linear(4, 4) + self._bias = 0.4 + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def forward(self, x): + out = self.fc1(x) + out = self.fc2(out) + out = paddle.nn.functional.relu(out) + out = paddle.mean(out) + return out + + @paddle.jit.to_static(input_spec=[InputSpec([None, 4], dtype='float32')]) + def infer(self, input): + out = self.fc2(input) + out = out + self._bias + out = paddle.mean(out) + return out + + +class TestMultiLoad(unittest.TestCase): + + def test_multi_load(self): + self.temp_dir = tempfile.TemporaryDirectory() + + x = paddle.full([2, 4], 2) + model = Net() + program_translator = ProgramTranslator() + program_translator.enable(False) + forward_out1 = model.forward(x) + infer_out1 = model.infer(x) + program_translator.enable(True) + + model_path = os.path.join(self.temp_dir.name, 'multi_program') + paddle.jit.save(model, model_path, combine_params=True) + place = paddle.CPUPlace() + if paddle.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + jit_layer = Layer() + jit_layer.load(model_path, place) + forward_out2 = jit_layer.forward(x) + infer_out2 = jit_layer.infer(x) + self.assertEqual(np.allclose(forward_out1, forward_out2[0]), True) + self.assertEqual(np.allclose(infer_out1, infer_out2[0]), True) + + self.temp_dir.cleanup() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/jit/layer.py b/python/paddle/jit/layer.py new file mode 100644 index 0000000000000..8ee3652dca843 --- /dev/null +++ b/python/paddle/jit/layer.py @@ -0,0 +1,51 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.fluid.core import Load + + +class Layer(object): + + def __init__(self): + self.cpp_layer = None + # {name: Function} + self.functions = {} + + def load(self, load_path, place): + self.cpp_layer = Load(load_path, place) + function_dict = self.cpp_layer.function_dict() + + for name, function in function_dict.items(): + self.functions[name] = Function(function) + setattr(self, name, self.functions[name]) + + +class Function(): + + def __init__(self, function): + self.function = function + self.info = FunctionInfo(function.info()) + + def __call__(self, *args): + return self.function(args) + + +class FunctionInfo(): + + def __init__(self, info): + self.info = info + + def name(self): + return self.info.name() From 9900b42bdf5014d283cfdd05b34f8832068f1831 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Fri, 8 Jul 2022 17:17:23 +0800 Subject: [PATCH 107/250] conv_fusion_fp16 (#44173) --- paddle/fluid/operators/fused/conv_fusion_op.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 2ee63c9364221..121cbc909b812 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -315,9 +315,14 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { cudnnConvolutionFwdAlgo_t algo; auto handle = dev_ctx.cudnn_handle(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto dtype = platform::CudnnDataType::type; PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cudnn_conv_desc, CUDNN_DEFAULT_MATH)); + if (dtype == CUDNN_DATA_HALF) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_TENSOR_OP_MATH)); + } #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8000 if (!platform::allow_tf32_cudnn) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( @@ -414,7 +419,6 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { algo = algo_cache.GetAlgorithm( x_dims[2] * x_dims[3], search_times, 0, search_func); } else { - auto dtype = platform::CudnnDataType::type; algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, From b2c1247c6a8ec093581e35ade3314d7d779c8dae Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Fri, 8 Jul 2022 19:06:12 +0800 Subject: [PATCH 108/250] [Dy2St]Polish visit function in transformer (#44083) * Polish visit function in transformer * Call generic_visit first in visit_While/For * Remove comments * Polish utils.py, move some transformer to base_transformer --- .../dygraph_to_static/base_transformer.py | 649 +++++++++++++++++- .../break_continue_transformer.py | 2 +- .../dygraph_to_static/list_transformer.py | 2 +- .../dygraph_to_static/loop_transformer.py | 21 +- .../dygraph/dygraph_to_static/origin_info.py | 5 +- .../dygraph_to_static/return_transformer.py | 4 +- .../fluid/dygraph/dygraph_to_static/utils.py | 638 +---------------- 7 files changed, 664 insertions(+), 657 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py index 127a8e9232422..a3c2c0c69efaf 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py @@ -13,8 +13,17 @@ # limitations under the License. 
from paddle.utils import gast - -from paddle.fluid.dygraph.dygraph_to_static.origin_info import ORIGI_INFO +from paddle.fluid import unique_name +from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name +from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.utils import create_assign_node +from paddle.fluid.dygraph.dygraph_to_static.utils import ORIGI_INFO +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_INDEX_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_TUPLE_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_TUPLE_INDEX_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_VAR_LEN_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_VAR_NAME_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_ZIP_TO_LIST_PREFIX class BaseTransformer(gast.NodeTransformer): @@ -36,3 +45,639 @@ def visit(self, node): setattr(n, ORIGI_INFO, origin_info) return result + + +class RenameTransformer(BaseTransformer): + + def __init__(self, node): + assert isinstance( + node, gast.AST), "RenameTransformer only accepts gast.AST as input" + self.root = node + self.old_name = "" + self.new_name = "" + + def rename(self, old_name, new_name): + self.old_name = old_name + self.new_name = new_name + self.visit(self.root) + + def visit_Name(self, node): + self.generic_visit(node) + if node.id == self.old_name: + node.id = self.new_name + return node + + def visit_Attribute(self, node): + self.generic_visit(node) + attr_full_name = get_attribute_full_name(node) + if attr_full_name == self.old_name: + new_name_node = gast.parse(self.new_name).body[0].value + return new_name_node + return node + + +class NameNodeReplaceTransformer(BaseTransformer): + """ + This class replaces specified gast.Name node by replace_node. + """ + + def __init__(self, root_node, target_name, replace_node): + assert isinstance(target_name, str) + + # NOTE(liym27): + # Use gast.Name to replace gast.Name, otherwise, errors may occur. + # + # For examples: + # If using a gast.Subscript to replace gast.Name, and the original gast.Name + # is in the arguments of FunctionDef, an exception will be raised. + # + # ``` + # def func(x[i])) # x[i] can not be a argument + # # ... + # ``` + + assert isinstance(replace_node, gast.Name) + self.target_name = target_name + self.replace_node = replace_node + + self.visit(root_node) + + def visit_Name(self, node): + if node.id == self.target_name: + return self.replace_node + return node + + def visit_Nonlocal(self, node): + names = node.names + + def replace(s): + if s == self.target_name: return self.replace_node.id + return s + + node.names = list(map(replace, names)) + return node + + +class ForLoopTuplePreTransformer(BaseTransformer): + """ + ForNodeVisitor parses 3 type statements (Here var is VarBase(Tensor) or python variable): + 1). for x in range(var[*]|var.numpy()[*]) + 2). for x in var|var.numpy() + 3). for i, x in enumerate(var|var.numpy()) + + We chose these 3 types because they are easier (x can be variable name iterating in var). + However, users can write tuples in Python for loop, such as + 1). for var1, var2 in var|var.numpy() + 2). for t in enumerate(var|var.numpy()) + 2). for i, (var1, var2, va3) in enumerate(var|var.numpy()) + + To handle these case, this method will do the rewrite tuple pre-process: + 1). 
Non-enumerate case: for var1, var2 in var|var.numpy() will be re-written as: + for FOR_ITER_TUPLE_PREFIX_x in var | var.numpy(): + var1 = FOR_ITER_TUPLE_PREFIX_x[0] + var2 = FOR_ITER_TUPLE_PREFIX_x[1] + 2). Enumerate out tuple case: for t in enumerate(var|var.numpy) will be rewritten as: + for FOR_ITER_TUPLE_INDEX_PREFIX_x, FOR_ITER_TUPLE_PREFIX_x in enumerate(var|var.numpy): + t = (FOR_ITER_TUPLE_INDEX_PREFIX_x, FOR_ITER_TUPLE_PREFIX_x) + 3). Enumerate inner tuple case: for i, (var1, (var2, va3)) in enumerate(var|var.numpy()) will + be re-written as: + for i, FOR_ITER_TUPLE_PREFIX_x in var | var.numpy(): + var1 = FOR_ITER_TUPLE_PREFIX_x[0] + var2 = FOR_ITER_TUPLE_PREFIX_x[1][0] + var3 = FOR_ITER_TUPLE_PREFIX_x[1][1] + """ + + def __init__(self, wrapper_root): + self.wrapper_root = wrapper_root + self.root = wrapper_root.node + + def transform(self): + self.visit(self.root) + + def visit_For(self, node): + if self.is_for_enumerate_iter(node): + if isinstance(node.target, (gast.Name, gast.Attribute)): + # Out tuple case + out_tuple_name = ast_to_source_code(node.target).strip() + tuple_iter_name = unique_name.generate( + FOR_ITER_TUPLE_INDEX_PREFIX) + tuple_var_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) + node.target = gast.Tuple(elts=[ + gast.Name(id=tuple_iter_name, + ctx=gast.Store(), + annotation=None, + type_comment=None), + gast.Name(id=tuple_var_name, + ctx=gast.Store(), + annotation=None, + type_comment=None) + ], + ctx=gast.Store()) + node.body.insert( + 0, + gast.Assign(targets=[ + gast.Name(id=out_tuple_name, + ctx=gast.Store(), + annotation=None, + type_comment=None) + ], + value=gast.Tuple(elts=[ + gast.Name(id=tuple_iter_name, + ctx=gast.Load(), + annotation=None, + type_comment=None), + gast.Name(id=tuple_var_name, + ctx=gast.Load(), + annotation=None, + type_comment=None) + ], + ctx=gast.Load()))) + elif isinstance(node.target, (gast.List, gast.Tuple)) and len( + node.target.elts) >= 2 and isinstance( + node.target.elts[1], (gast.List, gast.Tuple)): + # Inner tuple case + inner_tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) + origin_inner_tuple_node = node.target.elts[1] + node.target.elts[1] = gast.Name(id=inner_tuple_name, + ctx=gast.Store(), + annotation=None, + type_comment=None) + node.body[0:0] = self.tuple_to_stmts(origin_inner_tuple_node, + inner_tuple_name) + elif self.is_for_iter(node) and isinstance(node.target, + (gast.List, gast.Tuple)): + # Non-enumrate case: + tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) + origin_tuple_node = node.target + node.target = gast.Name(id=tuple_name, + ctx=gast.Store(), + annotation=None, + type_comment=None) + node.body[0:0] = self.tuple_to_stmts(origin_tuple_node, tuple_name) + return node + + def tuple_to_stmts(self, node, tuple_name, idx=[]): + if not isinstance(node, (gast.Tuple, gast.List)): + value_node_str = tuple_name + for i in idx: + value_node_str = value_node_str + "[{}]".format(i) + + node_str = ast_to_source_code(node).strip() + assign_node_str = "{} = {}".format(node_str, value_node_str) + assign_node = gast.parse(assign_node_str).body[0] + return [assign_node] + + # isinstance(node, (gast.Tuple, gast.List)) + ret = [] + for i, element in enumerate(node.elts): + ret += self.tuple_to_stmts(node.elts[i], tuple_name, idx + [i]) + return ret + + def is_for_iter(self, for_node): + assert isinstance(for_node, + gast.For), "Input node is not gast.For node." 
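A small, hypothetical driver for the pre-transformer described in the class docstring above: it parses a toy function, applies the rewrite, and prints the result. The bare stand-in object for the usual AstNodeWrapper and the shape of the generated temporary names are assumptions of this sketch, not part of the patch.

    # Requires this patch to be applied; run as a script so that
    # inspect.getsource returns unindented source.
    import inspect
    from paddle.utils import gast
    from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
    from paddle.fluid.dygraph.dygraph_to_static.base_transformer import (
        ForLoopTuplePreTransformer, )

    def func(pairs):
        for i, (a, b) in enumerate(pairs):
            print(i, a + b)

    root = gast.parse(inspect.getsource(func))
    wrapper = type("FakeWrapper", (), {"node": root})()  # stand-in for AstNodeWrapper
    ForLoopTuplePreTransformer(wrapper).transform()
    print(ast_to_source_code(root))
    # The inner tuple target (a, b) becomes a single generated name, and
    # "a = <name>[0]" / "b = <name>[1]" are prepended to the loop body,
    # matching case 3) in the class docstring.
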
+        if isinstance(for_node.iter, (gast.Name, gast.Attribute)):
+            return True
+        elif isinstance(for_node.iter, gast.Call) and isinstance(
+                for_node.iter.func,
+                gast.Attribute) and for_node.iter.func.attr == 'numpy':
+            return True
+        elif isinstance(for_node.iter, gast.Subscript):
+            return True
+        else:
+            return False
+
+    def is_for_enumerate_iter(self, for_node):
+        assert isinstance(for_node,
+                          gast.For), "Input node is not gast.For node."
+        return isinstance(for_node.iter, gast.Call) and isinstance(
+            for_node.iter.func,
+            gast.Name) and for_node.iter.func.id == "enumerate"
+
+
+class SplitAssignTransformer(BaseTransformer):
+    """
+    This class transforms sequence assignments and multi-target assignments to normal assignments.
+    """
+
+    def __init__(self, ast_node):
+        assert isinstance(ast_node, gast.AST)
+        self.ast_root = ast_node
+
+    def transform(self):
+        self.visit(self.ast_root)
+
+    def visit_Assign(self, node):
+        target_nodes = node.targets
+        if len(target_nodes) == 1:
+            node = self._parse_sequence_assign(node)
+        else:
+            node = self._parse_multi_target_assign(node)
+        return node
+
+    def _parse_sequence_assign(self, node):
+        """
+        a, b = c, d
+        ->
+        a = c
+        b = d
+        """
+        assert isinstance(node, gast.Assign)
+
+        target_nodes = node.targets
+        value_node = node.value
+        if not isinstance(target_nodes[0], (gast.List, gast.Tuple)):
+            return node
+        if not isinstance(value_node, (gast.List, gast.Tuple)):
+            return node
+
+        targets = node.targets[0].elts
+        values = node.value.elts
+        if len(targets) != len(values):
+            return node
+
+        new_nodes = []
+        for target, value in zip(targets, values):
+            assign_node = gast.Assign(targets=[target], value=value)
+            new_nodes.append(assign_node)
+
+        return new_nodes
+
+    def _parse_multi_target_assign(self, node):
+        """
+        Example 1:
+        a = b = c
+        ->
+        b = c
+        a = b
+
+        Example 2:
+        a, b = c, d = x
+        ->
+        c,d = x
+        a = c
+        b = d
+        """
+        assert isinstance(node, gast.Assign)
+
+        target_nodes = node.targets
+        value_node = node.value
+        new_nodes = []
+        for target in reversed(target_nodes):
+            assign_node = gast.Assign(targets=[target], value=value_node)
+            # NOTE: Because assign_node can be a sequence assign statement like `a,b = c,d`,
+            # it's necessary to visit this new assign_node
+            parsed_node = self.visit_Assign(assign_node)
+            if not isinstance(parsed_node, list):
+                parsed_node = [parsed_node]
+
+            new_nodes.extend(parsed_node)
+            value_node = target
+
+        return new_nodes
+
+
+class ForNodeVisitor(object):
+    """
+    This class parses a Python for statement and gets its three transformed
+    key statement components:
+        1). init_stmts: list[node], nodes that prepare the for loop; there may
+            be more than one
+        2). cond_stmt: node, condition node used to judge whether to continue
+            the loop
+        3). body_stmts: list[node], updated loop body; sometimes we should
+            change the original statements in the body, not just append new
+            statements
+
+    In this process, the semantics of the for loop do not change.
+
+    Now only 3 types of statements can be parsed (here var is a VarBase(Tensor)
+    or a python variable):
+        1). for x in range(var[*]|var.numpy()[*])
+        2). for x in var|var.numpy()
+        3). for i, x in enumerate(var|var.numpy())
+    """
+
+    def __init__(self, for_node):
+        assert isinstance(
+            for_node, gast.For
+        ), "Input node for the initialization of ForNodeVisitor is not gast.For node."
+        # 1. original for node
+        self.node = for_node
+
+        # 2. gast.For node main parts
+        self.target = for_node.target
+        # NOTE: type may be Node or list[Node]
+        self.iter_args = for_node.iter if self.is_for_iter(
+        ) else for_node.iter.args
+        self.body = for_node.body
+
+        # 3. key shared node or names
+        # - x:
+        #   - for x in range(***)
+        #   - for x in var|var.numpy()
+        #   - for i, x in enumerate(var|var.numpy())
+        self.iter_var_name = self._get_iter_var_name()
+
+        # - created index var to slice Variable: __for_loop_var_index_0
+        #   - for x in var|var.numpy()
+        #   - for i, x in enumerate(var|var.numpy())
+        self.iter_idx_name = unique_name.generate(FOR_ITER_INDEX_PREFIX)
+
+        # - created shape var to build loop condition: __for_loop_var_len_0
+        #   - for x in var|var.numpy()
+        #   - for i, x in enumerate(var|var.numpy())
+        #   - for x in var
+        self.iter_var_len_name = unique_name.generate(FOR_ITER_VAR_LEN_PREFIX)
+        # - created zip to list var : __for_loop_iter_zip_0
+        self.iter_zip_to_list_name = unique_name.generate(
+            FOR_ITER_ZIP_TO_LIST_PREFIX)
+
+        # - var.numpy()/var
+        #   - for x in var|var.numpy()
+        #   - for i, x in enumerate(var|var.numpy())
+        self.iter_node = self._get_iter_node()
+
+        # - enumerate i:
+        #   - for i, x in enumerate(var|var.numpy())
+        self.enum_idx_name = self._get_enum_idx_name()
+
+        # - range/enumerate args length
+        self.args_length = None
+
+    def parse(self):
+        self._args_check()
+        if self.is_for_range_iter():
+            return self._parse_for_range_stmts()
+        elif self.is_for_iter():
+            return self._parse_for_stmts()
+        elif self.is_for_enumerate_iter():
+            return self._parse_for_enumerate_stmts()
+        else:
+            return None
+
+    def is_for_range_iter(self):
+        return isinstance(self.node.iter, gast.Call) and isinstance(
+            self.node.iter.func,
+            gast.Name) and self.node.iter.func.id == "range"
+
+    def is_for_iter(self):
+        if isinstance(self.node.iter,
+                      (gast.Name, gast.Attribute, gast.List, gast.Tuple)):
+            return True
+        elif isinstance(self.node.iter, gast.Call) and isinstance(
+                self.node.iter.func,
+                gast.Attribute) and self.node.iter.func.attr == 'numpy':
+            return True
+        elif isinstance(self.node.iter, gast.Subscript):
+            return True
+        else:
+            return False
+
+    def is_for_enumerate_iter(self):
+        return isinstance(self.node.iter, gast.Call) and isinstance(
+            self.node.iter.func,
+            gast.Name) and self.node.iter.func.id == "enumerate"
+
+    def _args_check(self):
+        if self.is_for_range_iter():
+            self.args_length = len(self.iter_args)
+            assert self.args_length >= 1 and self.args_length <= 3, "range() function takes 1 to 3 arguments"
+        elif self.is_for_enumerate_iter():
+            self.args_length = len(self.iter_args)
+            assert self.args_length >= 1 and self.args_length <= 2, "enumerate() function takes 1 to 2 arguments"
+        else:
+            self.args_length = None
+
+    def _parse_for_range_stmts(self):
+        init_stmts = []
+        init_stmts.append(self._build_index_init_node())
+
+        compare_node = self._build_compare_node()
+        step_node = self._build_step_node()
+        cond_stmt = self._build_cond_stmt(step_node, compare_node)
+
+        body_stmts = self.body
+        body_stmts.append(self._build_index_increase_node(step_node))
+
+        return init_stmts, cond_stmt, body_stmts
+
+    def _parse_for_stmts(self):
+        init_stmts = []
+        init_stmts.extend(self._build_iter_node())
+        init_stmts.append(self._build_index_init_node())
+        init_stmts.append(self._build_var_len_assign_node())
+
+        compare_node = self._build_compare_node()
+        step_node = self._build_step_node()
+        cond_stmt = self._build_cond_stmt(step_node, compare_node)
+
+        body_stmts = self.body
+
+        # NOTE(liym27): Here we add a gast.Assign node, and its target is a gast.Name.
+        # In NameNodeReplaceTransformer, using gast.Name to replace gast.Name is safe.
+        target_node, assign_node = self._build_assign_var_slice_node()
+        body_stmts[0:0] = [assign_node]
+        for body_node in body_stmts:
+            NameNodeReplaceTransformer(body_node, self.iter_var_name,
+                                       target_node)
+        body_stmts.append(self._build_index_increase_node(step_node))
+
+        return init_stmts, cond_stmt, body_stmts
+
+    def _parse_for_enumerate_stmts(self):
+        init_stmts = []
+        init_stmts.extend(self._build_iter_node())
+        init_stmts.append(self._build_index_init_node())
+        init_stmts.append(self._build_var_len_assign_node())
+        init_stmts.append(self._build_enum_init_node())
+
+        compare_node = self._build_compare_node()
+        step_node = self._build_step_node()
+        cond_stmt = self._build_cond_stmt(step_node, compare_node)
+
+        body_stmts = self.body
+
+        target_node, assign_node = self._build_assign_var_slice_node()
+        body_stmts[0:0] = [assign_node]
+        for body_node in body_stmts:
+            NameNodeReplaceTransformer(body_node, self.iter_var_name,
+                                       target_node)
+
+        body_stmts.append(self._build_index_increase_node(step_node))
+        body_stmts.append(self._build_enum_increase_node())
+
+        return init_stmts, cond_stmt, body_stmts
+
+    def _build_index_init_node(self):
+        if self.is_for_range_iter():
+            if self.args_length == 1:
+                index_init_value_str = '0'
+            else:
+                index_init_value_str = ast_to_source_code(
+                    self.iter_args[0]).strip()
+
+            index_init_var_name = self.iter_var_name
+        else:
+            index_init_value_str = '0'
+            index_init_var_name = self.iter_idx_name
+
+        index_init_node_source_str = "{target} = {value}".format(
+            target=index_init_var_name, value=index_init_value_str)
+
+        index_init_node = gast.parse(index_init_node_source_str).body[0]
+
+        return index_init_node
+
+    def _build_var_len_assign_node(self):
+        # get the length of iterable variable
+        if isinstance(self.iter_node, gast.Call) and isinstance(
+                self.iter_node.func,
+                gast.Attribute) and self.iter_node.func.attr == 'numpy':
+            iter_var_name = ast_to_source_code(
+                self.iter_node.func.value).strip()
+        else:
+            iter_var_name = ast_to_source_code(self.iter_node).strip()
+
+        convert_len_node_source_str = '{} = _jst.Len({})'.format(
+            self.iter_var_len_name, iter_var_name)
+
+        convert_len_node = gast.parse(convert_len_node_source_str).body[0]
+
+        return convert_len_node
+
+    def _build_iter_node(self):
+        """
+        Process special cases for iter_node include:
+        - Case 1 (for zip):
+
+            - for i, val in enumerate(zip(x, y))  # original code:
+
+            - __for_loop_iter_zip_0 = list(zip(x, y))
+            - for i, val in enumerate(__for_loop_iter_zip_0)
+        """
+        new_nodes = []
+        if isinstance(self.iter_node, gast.Call) and isinstance(
+                self.iter_node.func, gast.Name):
+            if self.iter_node.func.id == 'zip':
+                iter_var_name = ast_to_source_code(self.iter_node).strip()
+                zip_to_list_str = "{target} = list({value})".format(
+                    target=self.iter_zip_to_list_name, value=iter_var_name)
+                zip_to_list_node = gast.parse(zip_to_list_str).body[0]
+                new_nodes.append(zip_to_list_node)
+
+                self.iter_node = gast.Name(id=self.iter_zip_to_list_name,
+                                           ctx=gast.Load(),
+                                           annotation=None,
+                                           type_comment=None)
+
+        return new_nodes
+
+    def _build_enum_init_node(self):
+        if self.is_for_enumerate_iter() and self.args_length != 1:
+            init_value_str = ast_to_source_code(self.iter_args[1]).strip()
+        else:
+            init_value_str = '0'
+
+        enum_init_node_source_str = "{} = {}".format(self.enum_idx_name,
+                                                     init_value_str)
+        enum_init_node = gast.parse(enum_init_node_source_str).body[0]
+        return enum_init_node
+
+    def _build_compare_node(self):
+        if 
self.is_for_range_iter(): + compare_node = self.iter_args[ + 0] if self.args_length == 1 else self.iter_args[1] + else: + compare_node = gast.Name(id=self.iter_var_len_name, + ctx=gast.Load(), + annotation=None, + type_comment=None) + return compare_node + + def _build_step_node(self): + if self.is_for_range_iter(): + step_node = self.iter_args[ + 2] if self.args_length == 3 else gast.Constant(value=1, + kind=None) + else: + step_node = gast.Constant(value=1, kind=None) + return step_node + + def _build_cond_stmt(self, step_node, compare_node): + if not isinstance(step_node, (gast.Constant, gast.UnaryOp)): + raise NotImplementedError( + "Dynamic-to-Static only supports the step value is a constant or negative constant in 'for-range' statements, " + "such as '2', '-3'. But received: '{}'. Please fix code to be compatible with Dynamic-to-Static." + .format(ast_to_source_code(step_node).strip())) + + if isinstance(step_node, gast.UnaryOp) or step_node.value < 0: + # eg: + # range(max, min, -2) + # -> + # i > min + return gast.Compare(left=gast.Name( + id=self.iter_var_name + if self.is_for_range_iter() else self.iter_idx_name, + ctx=gast.Load(), + annotation=None, + type_comment=None), + ops=[gast.Gt()], + comparators=[compare_node]) + else: + # eg: + # range(min, max, 2) + # -> + # i < max + return gast.Compare(left=gast.Name( + id=self.iter_var_name + if self.is_for_range_iter() else self.iter_idx_name, + ctx=gast.Load(), + annotation=None, + type_comment=None), + ops=[gast.Lt()], + comparators=[compare_node]) + + def _build_index_increase_node(self, step_node): + return gast.AugAssign(target=gast.Name( + id=self.iter_var_name + if self.is_for_range_iter() else self.iter_idx_name, + ctx=gast.Store(), + annotation=None, + type_comment=None), + op=gast.Add(), + value=step_node) + + def _build_assign_var_slice_node(self): + var_slice_str = "{}[{}]".format( + ast_to_source_code(self.iter_node).strip(), self.iter_idx_name) + var_slice_node = gast.parse(var_slice_str).body[0].value + new_iter_var_name = unique_name.generate(FOR_ITER_VAR_NAME_PREFIX) + target_node, assign_node = create_assign_node(new_iter_var_name, + var_slice_node) + return target_node, assign_node + + def _build_enum_increase_node(self): + return gast.AugAssign(target=gast.Name(id=self.enum_idx_name, + ctx=gast.Store(), + annotation=None, + type_comment=None), + op=gast.Add(), + value=gast.Constant(value=1, kind=None)) + + def _get_iter_var_name(self): + if self.is_for_range_iter(): + return self.target.id + elif self.is_for_iter(): + return self.target.id + elif self.is_for_enumerate_iter(): + return self.target.elts[1].id + return None + + def _get_iter_node(self): + if self.is_for_iter(): + return self.iter_args + elif self.is_for_enumerate_iter(): + return self.iter_args[0] + return None + + def _get_enum_idx_name(self): + if self.is_for_enumerate_iter(): + return self.target.elts[0].id + return None diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py index 020721e85a235..b63fe6eea5af2 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py @@ -18,10 +18,10 @@ from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import index_in_list -from paddle.fluid.dygraph.dygraph_to_static.utils import ForNodeVisitor from paddle.fluid.dygraph.dygraph_to_static.utils import 
BaseNodeVisitor from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_bool_node from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import ForNodeVisitor __all__ = ['BreakContinueTransformer'] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py index e29ec6c6e1d73..29e3ed5296806 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py @@ -21,8 +21,8 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import slice_is_num from paddle.fluid.dygraph.dygraph_to_static.utils import is_control_flow_to_transform -from paddle.fluid.dygraph.dygraph_to_static.utils import SplitAssignTransformer from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import SplitAssignTransformer class ListTransformer(BaseTransformer): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index f04161f2c34cc..29ac905074e1d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -25,14 +25,14 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.utils import generate_name_node from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name -from paddle.fluid.dygraph.dygraph_to_static.utils import ForLoopTuplePreTransformer -from paddle.fluid.dygraph.dygraph_to_static.utils import ForNodeVisitor -from paddle.fluid.dygraph.dygraph_to_static.utils import RenameTransformer from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_nodes, create_get_args_node, create_set_args_node from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import ARGS_NAME from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import RenameTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import ForLoopTuplePreTransformer +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import ForNodeVisitor __all__ = ['LoopTransformer', 'NameVisitor'] @@ -489,14 +489,15 @@ def transform(self): self.name_visitor = NameVisitor(self.root) self.visit(self.root) - def visit(self, node): + def visit_While(self, node): self.generic_visit(node) - # All parent nodes that may contain gast.While/gast.For - if hasattr(node, 'body'): - self.replace_stmt_list(node.body) - if hasattr(node, 'orelse'): - self.replace_stmt_list(node.orelse) - return node + new_stmts = self.get_while_stmt_nodes(node) + return new_stmts + + def visit_For(self, node): + self.generic_visit(node) + new_stmts = self.get_for_stmt_nodes(node) + return new_stmts def replace_stmt_list(self, body_list): if not isinstance(body_list, list): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py 
b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index de12677768332..93f089cf8dd9d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -20,16 +20,13 @@ from paddle.utils import gast from paddle.fluid import core from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap +from paddle.fluid.dygraph.dygraph_to_static.utils import ORIGI_INFO from paddle.fluid.framework import Program try: from collections.abc import Sequence except: from collections import Sequence -# NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node. -ORIGI_INFO = "Original information of source code for ast node." -ORIGI_INFO_MAP = "Original information map of source code." - class Location(object): """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 2b95f346ae275..3eadd455e1033 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -188,9 +188,7 @@ def visit(self, node): Self-defined visit for appending ancestor """ self.ancestor_nodes.append(node) - method = 'visit_' + node.__class__.__name__ - visitor = getattr(self, method, self.generic_visit) - ret = visitor(node) + ret = super(ReturnTransformer, self).visit(node) self.ancestor_nodes.pop() return ret diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 1c507ab23c311..9f390252f3a2c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -42,6 +42,8 @@ GET_ARGS_FUNC_PREFIX = 'get_args' SET_ARGS_FUNC_PREFIX = 'set_args' ARGS_NAME = '__args' +# NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node. +ORIGI_INFO = "Original information of source code for ast node." class BaseNodeVisitor(gast.NodeVisitor): @@ -541,35 +543,6 @@ def create_assign_node(name, node): return targets, assign_node -class RenameTransformer(gast.NodeTransformer): - - def __init__(self, node): - assert isinstance( - node, gast.AST), "RenameTransformer only accepts gast.AST as input" - self.root = node - self.old_name = "" - self.new_name = "" - - def rename(self, old_name, new_name): - self.old_name = old_name - self.new_name = new_name - self.visit(self.root) - - def visit_Name(self, node): - self.generic_visit(node) - if node.id == self.old_name: - node.id = self.new_name - return node - - def visit_Attribute(self, node): - self.generic_visit(node) - attr_full_name = get_attribute_full_name(node) - if attr_full_name == self.old_name: - new_name_node = gast.parse(self.new_name).body[0].value - return new_name_node - return node - - def ast_to_func(ast_root, dyfunc, delete_on_exit=True): """ Transform modified AST of decorated function into python callable object. @@ -897,613 +870,6 @@ def get_compare_nodes_with_tensor(self): return self._compare_node_tenor_set -class NameNodeReplaceTransformer(gast.NodeTransformer): - """ - This class replaces specified gast.Name node by replace_node. - """ - - def __init__(self, root_node, target_name, replace_node): - assert isinstance(target_name, str) - - # NOTE(liym27): - # Use gast.Name to replace gast.Name, otherwise, errors may occur. 
- # - # For examples: - # If using a gast.Subscript to replace gast.Name, and the original gast.Name - # is in the arguments of FunctionDef, an exception will be raised. - # - # ``` - # def func(x[i])) # x[i] can not be a argument - # # ... - # ``` - - assert isinstance(replace_node, gast.Name) - self.target_name = target_name - self.replace_node = replace_node - - self.visit(root_node) - - def visit_Name(self, node): - if node.id == self.target_name: - return self.replace_node - return node - - def visit_Nonlocal(self, node): - names = node.names - - def replace(s): - if s == self.target_name: return self.replace_node.id - return s - - node.names = list(map(replace, names)) - return node - - -class ForLoopTuplePreTransformer(gast.NodeTransformer): - """ - ForNodeVisitor parses 3 type statements (Here var is VarBase(Tensor) or python variable): - 1). for x in range(var[*]|var.numpy()[*]) - 2). for x in var|var.numpy() - 3). for i, x in enumerate(var|var.numpy()) - - We chose these 3 types because they are easier (x can be variable name iterating in var). - However, users can write tuples in Python for loop, such as - 1). for var1, var2 in var|var.numpy() - 2). for t in enumerate(var|var.numpy()) - 2). for i, (var1, var2, va3) in enumerate(var|var.numpy()) - - To handle these case, this method will do the rewrite tuple pre-process: - 1). Non-enumerate case: for var1, var2 in var|var.numpy() will be re-written as: - for FOR_ITER_TUPLE_PREFIX_x in var | var.numpy(): - var1 = FOR_ITER_TUPLE_PREFIX_x[0] - var2 = FOR_ITER_TUPLE_PREFIX_x[1] - 2). Enumerate out tuple case: for t in enumerate(var|var.numpy) will be rewritten as: - for FOR_ITER_TUPLE_INDEX_PREFIX_x, FOR_ITER_TUPLE_PREFIX_x in enumerate(var|var.numpy): - t = (FOR_ITER_TUPLE_INDEX_PREFIX_x, FOR_ITER_TUPLE_PREFIX_x) - 3). 
Enumerate inner tuple case: for i, (var1, (var2, va3)) in enumerate(var|var.numpy()) will - be re-written as: - for i, FOR_ITER_TUPLE_PREFIX_x in var | var.numpy(): - var1 = FOR_ITER_TUPLE_PREFIX_x[0] - var2 = FOR_ITER_TUPLE_PREFIX_x[1][0] - var3 = FOR_ITER_TUPLE_PREFIX_x[1][1] - """ - - def __init__(self, wrapper_root): - self.wrapper_root = wrapper_root - self.root = wrapper_root.node - - def transform(self): - self.visit(self.root) - - def visit_For(self, node): - if self.is_for_enumerate_iter(node): - if isinstance(node.target, (gast.Name, gast.Attribute)): - # Out tuple case - out_tuple_name = ast_to_source_code(node.target).strip() - tuple_iter_name = unique_name.generate( - FOR_ITER_TUPLE_INDEX_PREFIX) - tuple_var_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) - node.target = gast.Tuple(elts=[ - gast.Name(id=tuple_iter_name, - ctx=gast.Store(), - annotation=None, - type_comment=None), - gast.Name(id=tuple_var_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - ctx=gast.Store()) - node.body.insert( - 0, - gast.Assign(targets=[ - gast.Name(id=out_tuple_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - value=gast.Tuple(elts=[ - gast.Name(id=tuple_iter_name, - ctx=gast.Load(), - annotation=None, - type_comment=None), - gast.Name(id=tuple_var_name, - ctx=gast.Load(), - annotation=None, - type_comment=None) - ], - ctx=gast.Load()))) - elif isinstance(node.target, (gast.List, gast.Tuple)) and len( - node.target.elts) >= 2 and isinstance( - node.target.elts[1], (gast.List, gast.Tuple)): - # Inner tuple case - inner_tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) - origin_inner_tuple_node = node.target.elts[1] - node.target.elts[1] = gast.Name(id=inner_tuple_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - node.body[0:0] = self.tuple_to_stmts(origin_inner_tuple_node, - inner_tuple_name) - elif self.is_for_iter(node) and isinstance(node.target, - (gast.List, gast.Tuple)): - # Non-enumrate case: - tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) - origin_tuple_node = node.target - node.target = gast.Name(id=tuple_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - node.body[0:0] = self.tuple_to_stmts(origin_tuple_node, tuple_name) - return node - - def tuple_to_stmts(self, node, tuple_name, idx=[]): - if not isinstance(node, (gast.Tuple, gast.List)): - value_node_str = tuple_name - for i in idx: - value_node_str = value_node_str + "[{}]".format(i) - - node_str = ast_to_source_code(node).strip() - assign_node_str = "{} = {}".format(node_str, value_node_str) - assign_node = gast.parse(assign_node_str).body[0] - return [assign_node] - - # isinstance(node, (gast.Tuple, gast.List)) - ret = [] - for i, element in enumerate(node.elts): - ret += self.tuple_to_stmts(node.elts[i], tuple_name, idx + [i]) - return ret - - def is_for_iter(self, for_node): - assert isinstance(for_node, - gast.For), "Input node is not gast.For node." - if isinstance(for_node.iter, (gast.Name, gast.Attribute)): - return True - elif isinstance(for_node.iter, gast.Call) and isinstance( - for_node.iter.func, - gast.Attribute) and for_node.iter.func.attr == 'numpy': - return True - elif isinstance(for_node.iter, gast.Subscript): - return True - else: - return False - - def is_for_enumerate_iter(self, for_node): - assert isinstance(for_node, - gast.For), "Input node is not gast.For node." 
- return isinstance(for_node.iter, gast.Call) and isinstance( - for_node.iter.func, - gast.Name) and for_node.iter.func.id == "enumerate" - - -class ForNodeVisitor(object): - """ - This class parses python for statement, get transformed 3 statement components of for node - three key statements: - 1). init_stmts: list[node], prepare nodes of for loop, may not only one - 2). cond_stmt: node, condition node to judge whether continue loop - 3). body_stmts: list[node], updated loop body, sometimes we should change - the original statement in body, not just append new statement - - In this process, the semantics of for does not change. - - Now only can parse 3 type statements (Here var is VarBase(Tensor) or python variable): - 1). for x in range(var[*]|var.numpy()[*]) - 2). for x in var|var.numpy() - 3). for i, x enumerate(var|var.numpy()) - """ - - def __init__(self, for_node): - assert isinstance( - for_node, gast.For - ), "Input node for the initialization of ForNodeVisitor is not gast.For node." - # 1. original for node - self.node = for_node - - # 2. gast.For node main parts - self.target = for_node.target - # NOTE: type may be Node or list[Node] - self.iter_args = for_node.iter if self.is_for_iter( - ) else for_node.iter.args - self.body = for_node.body - - # 3. key shared node or names - # - x: - # - for x in range(***) - # - for x in var|var.numpy() - # - for i, x enumerate(var|var.numpy()) - self.iter_var_name = self._get_iter_var_name() - - # - created index var to slice Variable: __for_loop_var_index_0 - # - for x in var|var.numpy() - # - for i, x enumerate(var|var.numpy()) - self.iter_idx_name = unique_name.generate(FOR_ITER_INDEX_PREFIX) - - # - created shape var to build loop condition: __for_loop_var_len_0 - # - for x in var|var.numpy() - # - for i, x enumerate(var|var.numpy()) - # - for x in var - self.iter_var_len_name = unique_name.generate(FOR_ITER_VAR_LEN_PREFIX) - # - created zip to list var : __for_loop_iter_zip_0 - self.iter_zip_to_list_name = unique_name.generate( - FOR_ITER_ZIP_TO_LIST_PREFIX) - - # - var.numpy()/var - # - for x in var|var.numpy() - # - for i, x enumerate(var|var.numpy()) - self.iter_node = self._get_iter_node() - - # - enumeate i: - # - for i, x enumerate(var|var.numpy()) - self.enum_idx_name = self._get_enum_idx_name() - - # - range/enumerate args length - self.args_length = None - - def parse(self): - self._args_check() - if self.is_for_range_iter(): - return self._parse_for_range_stmts() - elif self.is_for_iter(): - return self._parse_for_stmts() - elif self.is_for_enumerate_iter(): - return self._parse_for_enumerate_stmts() - else: - return None - - def is_for_range_iter(self): - return isinstance(self.node.iter, gast.Call) and isinstance( - self.node.iter.func, - gast.Name) and self.node.iter.func.id == "range" - - def is_for_iter(self): - if isinstance(self.node.iter, - (gast.Name, gast.Attribute, gast.List, gast.Tuple)): - return True - elif isinstance(self.node.iter, gast.Call) and isinstance( - self.node.iter.func, - gast.Attribute) and self.node.iter.func.attr == 'numpy': - return True - elif isinstance(self.node.iter, gast.Subscript): - return True - else: - return False - - def is_for_enumerate_iter(self): - return isinstance(self.node.iter, gast.Call) and isinstance( - self.node.iter.func, - gast.Name) and self.node.iter.func.id == "enumerate" - - def _args_check(self): - if self.is_for_range_iter(): - self.args_length = len(self.iter_args) - assert self.args_length >= 1 and self.args_length <= 3, "range() function takes 1 to 3 arguments" - 
elif self.is_for_enumerate_iter(): - self.args_length = len(self.iter_args) - assert self.args_length >= 1 and self.args_length <= 2, "enumerate() function takes 1 to 2 arguments" - else: - self.args_length = None - - def _parse_for_range_stmts(self): - init_stmts = [] - init_stmts.append(self._build_index_init_node()) - - compare_node = self._build_compare_node() - step_node = self._build_step_node() - cond_stmt = self._build_cond_stmt(step_node, compare_node) - - body_stmts = self.body - body_stmts.append(self._build_index_increase_node(step_node)) - - return init_stmts, cond_stmt, body_stmts - - def _parse_for_stmts(self): - init_stmts = [] - init_stmts.extend(self._build_iter_node()) - init_stmts.append(self._build_index_init_node()) - init_stmts.append(self._build_var_len_assign_node()) - - compare_node = self._build_compare_node() - step_node = self._build_step_node() - cond_stmt = self._build_cond_stmt(step_node, compare_node) - - body_stmts = self.body - - # NOTE(liym27): Here add a gast.Assign, and the target of it is gast.Name. - # In NameNodeReplaceTransformer, using gast.Name to replace gast.Name is safe. - target_node, assign_node = self._build_assign_var_slice_node() - body_stmts[0:0] = [assign_node] - for body_node in body_stmts: - NameNodeReplaceTransformer(body_node, self.iter_var_name, - target_node) - body_stmts.append(self._build_index_increase_node(step_node)) - - return init_stmts, cond_stmt, body_stmts - - def _parse_for_enumerate_stmts(self): - init_stmts = [] - init_stmts.extend(self._build_iter_node()) - init_stmts.append(self._build_index_init_node()) - init_stmts.append(self._build_var_len_assign_node()) - init_stmts.append(self._build_enum_init_node()) - - compare_node = self._build_compare_node() - step_node = self._build_step_node() - cond_stmt = self._build_cond_stmt(step_node, compare_node) - - body_stmts = self.body - - target_node, assign_node = self._build_assign_var_slice_node() - body_stmts[0:0] = [assign_node] - for body_node in body_stmts: - NameNodeReplaceTransformer(body_node, self.iter_var_name, - target_node) - - body_stmts.append(self._build_index_increase_node(step_node)) - body_stmts.append(self._build_enum_increase_node()) - - return init_stmts, cond_stmt, body_stmts - - def _build_index_init_node(self): - if self.is_for_range_iter(): - if self.args_length == 1: - index_init_value_str = '0' - else: - index_init_value_str = ast_to_source_code( - self.iter_args[0]).strip() - - index_init_var_name = self.iter_var_name - else: - index_init_value_str = '0' - index_init_var_name = self.iter_idx_name - - index_init_node_source_str = "{target} = {value}".format( - target=index_init_var_name, value=index_init_value_str) - - index_init_node = gast.parse(index_init_node_source_str).body[0] - - return index_init_node - - def _build_var_len_assign_node(self): - # get the length of iterable variable - if isinstance(self.iter_node, gast.Call) and isinstance( - self.iter_node.func, - gast.Attribute) and self.iter_node.func.attr == 'numpy': - iter_var_name = ast_to_source_code( - self.iter_node.func.value).strip() - else: - iter_var_name = ast_to_source_code(self.iter_node).strip() - - convert_len_node_source_str = '{} = _jst.Len({})'.format( - self.iter_var_len_name, iter_var_name) - - convert_len_node = gast.parse(convert_len_node_source_str).body[0] - - return convert_len_node - - def _build_iter_node(self): - """ - Process special cases for iter_node inclue: - - Case 1 (for zip): - - - for i, val in enumerate(zip(x, y)) # original code: - - - 
__for_loop_iter_zip_0 = list(zip(x, y)) - - for i, val in enumerate(__for_loop_iter_zip_0) - """ - new_nodes = [] - if isinstance(self.iter_node, gast.Call) and isinstance( - self.iter_node.func, gast.Name): - if self.iter_node.func.id == 'zip': - iter_var_name = ast_to_source_code(self.iter_node).strip() - zip_to_list_str = "{target} = list({value})".format( - target=self.iter_zip_to_list_name, value=iter_var_name) - zip_to_list_node = gast.parse(zip_to_list_str).body[0] - new_nodes.append(zip_to_list_node) - - self.iter_node = gast.Name(id=self.iter_zip_to_list_name, - ctx=gast.Load(), - annotation=None, - type_comment=None) - - return new_nodes - - def _build_enum_init_node(self): - if self.is_for_enumerate_iter() and self.args_length != 1: - init_value_str = ast_to_source_code(self.iter_args[1]).strip() - else: - init_value_str = '0' - - enum_init_node_source_str = "{} = {}".format(self.enum_idx_name, - init_value_str) - enum_init_node = gast.parse(enum_init_node_source_str).body[0] - return enum_init_node - - def _build_compare_node(self): - if self.is_for_range_iter(): - compare_node = self.iter_args[ - 0] if self.args_length == 1 else self.iter_args[1] - else: - compare_node = gast.Name(id=self.iter_var_len_name, - ctx=gast.Load(), - annotation=None, - type_comment=None) - return compare_node - - def _build_step_node(self): - if self.is_for_range_iter(): - step_node = self.iter_args[ - 2] if self.args_length == 3 else gast.Constant(value=1, - kind=None) - else: - step_node = gast.Constant(value=1, kind=None) - return step_node - - def _build_cond_stmt(self, step_node, compare_node): - if not isinstance(step_node, (gast.Constant, gast.UnaryOp)): - raise NotImplementedError( - "Dynamic-to-Static only supports the step value is a constant or negative constant in 'for-range' statements, " - "such as '2', '-3'. But received: '{}'. Please fix code to be compatible with Dynamic-to-Static." 
- .format(ast_to_source_code(step_node).strip())) - - if isinstance(step_node, gast.UnaryOp) or step_node.value < 0: - # eg: - # range(max, min, -2) - # -> - # i > min - return gast.Compare(left=gast.Name( - id=self.iter_var_name - if self.is_for_range_iter() else self.iter_idx_name, - ctx=gast.Load(), - annotation=None, - type_comment=None), - ops=[gast.Gt()], - comparators=[compare_node]) - else: - # eg: - # range(min, max, 2) - # -> - # i < max - return gast.Compare(left=gast.Name( - id=self.iter_var_name - if self.is_for_range_iter() else self.iter_idx_name, - ctx=gast.Load(), - annotation=None, - type_comment=None), - ops=[gast.Lt()], - comparators=[compare_node]) - - def _build_index_increase_node(self, step_node): - return gast.AugAssign(target=gast.Name( - id=self.iter_var_name - if self.is_for_range_iter() else self.iter_idx_name, - ctx=gast.Store(), - annotation=None, - type_comment=None), - op=gast.Add(), - value=step_node) - - def _build_assign_var_slice_node(self): - var_slice_str = "{}[{}]".format( - ast_to_source_code(self.iter_node).strip(), self.iter_idx_name) - var_slice_node = gast.parse(var_slice_str).body[0].value - new_iter_var_name = unique_name.generate(FOR_ITER_VAR_NAME_PREFIX) - target_node, assign_node = create_assign_node(new_iter_var_name, - var_slice_node) - return target_node, assign_node - - def _build_enum_increase_node(self): - return gast.AugAssign(target=gast.Name(id=self.enum_idx_name, - ctx=gast.Store(), - annotation=None, - type_comment=None), - op=gast.Add(), - value=gast.Constant(value=1, kind=None)) - - def _get_iter_var_name(self): - if self.is_for_range_iter(): - return self.target.id - elif self.is_for_iter(): - return self.target.id - elif self.is_for_enumerate_iter(): - return self.target.elts[1].id - return None - - def _get_iter_node(self): - if self.is_for_iter(): - return self.iter_args - elif self.is_for_enumerate_iter(): - return self.iter_args[0] - return None - - def _get_enum_idx_name(self): - if self.is_for_enumerate_iter(): - return self.target.elts[0].id - return None - - -class SplitAssignTransformer(gast.NodeTransformer): - """ - This class transforms sequence assignments and multi-target assignments to normal assignments. 
- """ - - def __init__(self, ast_node): - assert isinstance(ast_node, gast.AST) - self.ast_root = ast_node - - def transform(self): - self.visit(self.ast_root) - - def visit_Assign(self, node): - target_nodes = node.targets - if len(target_nodes) == 1: - node = self._parse_sequence_assign(node) - else: - node = self._parse_multi_target_assign(node) - return node - - def _parse_sequence_assign(self, node): - """ - a, b = c, d - -> - a = c - b = d - """ - assert isinstance(node, gast.Assign) - - target_nodes = node.targets - value_node = node.value - if not isinstance(target_nodes[0], (gast.List, gast.Tuple)): - return node - if not isinstance(value_node, (gast.List, gast.Tuple)): - return node - - targets = node.targets[0].elts - values = node.value.elts - if len(targets) != len(values): - return node - - new_nodes = [] - for target, value in zip(targets, values): - assign_node = gast.Assign(targets=[target], value=value) - new_nodes.append(assign_node) - - return new_nodes - - def _parse_multi_target_assign(self, node): - """ - Example 1: - a = b = c - -> - b = c - a = b - - Example 2: - a, b = c, d = x - -> - c,d = x - a = c - b = d - """ - assert isinstance(node, gast.Assign) - - target_nodes = node.targets - value_node = node.value - new_nodes = [] - for target in reversed(target_nodes): - assign_node = gast.Assign(targets=[target], value=value_node) - # NOTE: Because assign_node can be sequence assign statement like `a,b = c,d`, - # it's necessary to visit this new assign_node - parsed_node = self.visit_Assign(assign_node) - if not isinstance(parsed_node, list): - parsed_node = [parsed_node] - - new_nodes.extend(parsed_node) - value_node = target - - return new_nodes - - # NOTE: inspect.unwrap() exits in PY3 but not in PY2. def unwrap(func): """ From 7f958728e561ca2aa99f3c0a04cb60ff537a3d7e Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 8 Jul 2022 20:34:50 +0800 Subject: [PATCH 109/250] Inference support mixed-precision model [3] (#44057) --- paddle/fluid/inference/analysis/argument.h | 3 + .../inference/analysis/ir_pass_manager.cc | 3 + .../ir_passes/tensorrt_subgraph_pass.cc | 105 ++++++++++- .../passes/convert_to_mixed_precision.cc | 37 ++-- paddle/fluid/inference/api/analysis_config.cc | 10 + .../fluid/inference/api/analysis_predictor.cc | 2 + .../inference/api/paddle_analysis_config.h | 11 ++ .../inference/api/paddle_pass_builder.cc | 4 + .../tensorrt/convert/affine_channel_op.cc | 20 +- .../inference/tensorrt/convert/conv2d_op.cc | 23 ++- .../inference/tensorrt/convert/conv3d_op.cc | 8 +- .../tensorrt/convert/deformable_conv_op.cc | 21 ++- .../tensorrt/convert/elementwise_op.cc | 7 +- .../tensorrt/convert/emb_eltwise_layernorm.cc | 93 ++++++---- .../fluid/inference/tensorrt/convert/fc_op.cc | 67 ++++--- .../tensorrt/convert/group_norm_op.cc | 23 +-- .../tensorrt/convert/layer_norm_op.cc | 56 +++--- .../tensorrt/convert/multihead_matmul_op.cc | 6 +- .../inference/tensorrt/convert/op_converter.h | 32 +--- .../convert/preln_emb_eltwise_layernorm.cc | 3 +- .../tensorrt/convert/preln_residual_bias.cc | 4 +- .../tensorrt/convert/preln_skip_layernorm.cc | 3 +- .../inference/tensorrt/convert/prelu_op.cc | 29 ++- .../tensorrt/convert/skip_layernorm.cc | 70 ++++--- .../tensorrt/convert/sparse_fc_op.cc | 10 +- .../convert/sparse_multihead_matmul_op.cc | 6 +- .../fluid/inference/tensorrt/convert/utils.h | 45 +++++ paddle/fluid/inference/tensorrt/engine.cc | 175 ++++++++++++++++-- paddle/fluid/inference/tensorrt/engine.h | 28 ++- .../inference/tensorrt/test_dynamic_engine.cc | 2 + 
.../operators/tensorrt/tensorrt_engine_op.h | 10 + .../tensorrt/tensorrt_engine_op_test.cc | 3 + 32 files changed, 651 insertions(+), 268 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/utils.h diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index e69a1e0e1ffb0..717737749a96b 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -331,6 +331,9 @@ struct Argument { // mixed precision related DECL_ARGUMENT_FIELD(model_precision, ModelPrecision, int); + DECL_ARGUMENT_FIELD(mixed_black_list, + MixedBlackList, + std::unordered_set); private: std::unordered_set valid_fields_; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4aeaefa3c49c3..3c04638003cdd 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -87,6 +87,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); pass->Set("model_precision", new int(argument->model_precision())); + pass->Set( + "mixed_black_list", + new std::unordered_set(argument->mixed_black_list())); if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 7a9c5b889d146..d39eadc7cc8f1 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,26 +13,117 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/subgraph_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace inference { namespace analysis { +namespace { + +bool IsFloat(framework::proto::VarType::Type t) { + if (t == framework::proto::VarType::FP16 || + t == framework::proto::VarType::FP32 || + t == framework::proto::VarType::FP64 || + t == framework::proto::VarType::BF16) + return true; + return false; +} + +// if in mixed model precision, we should make all tensorrt_engine's output +// floats dtype to float32 dtype. 
+void OutputProcess(framework::ir::Graph *graph, + const std::unordered_set &trt_outputs, + phi::Backend backend, + phi::DataType precision, + const std::unordered_set &blacklist) { + framework::BlockDesc *block_desc{nullptr}; + int suffix = 0; + std::unordered_map + var_to_cast_op_map; + + framework::proto::VarType::Type to_type; + if (precision == phi::DataType::FLOAT16) { + to_type = framework::proto::VarType::FP16; + } else if (precision == phi::DataType::BFLOAT16) { + to_type = framework::proto::VarType::BF16; + } else if (precision == phi::DataType::FLOAT32) { + return; + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "mixed_precision currently not supported dtype %d, we now only support " + "fp16 and bf16.", + static_cast(precision))); + } + + for (auto *op_node : framework::ir::TopologySortOperations(*graph)) { + if (!op_node->IsOp()) continue; + auto op_type = op_node->Op()->Type(); + if (op_type == "feed") block_desc = op_node->Op()->Block(); + if (op_type != "tensorrt_engine") continue; + for (auto *var_node : op_node->outputs) { + if (!trt_outputs.count(var_node)) continue; + if (!var_node->Var()->Persistable() && + IsFloat(var_node->Var()->GetDataType()) && + var_node->Var()->GetDataType() != framework::proto::VarType::FP32) { + for (auto *next_op : var_node->outputs) { + // if next_op support mixed_precision, we need to add cast op. + if (OpSupportPrecision( + phi::TransToPhiKernelName(next_op->Op()->Type()), + backend, + precision, + blacklist)) { + AddCastOp(graph, + var_node, + next_op, + framework::proto::VarType::FP32, + to_type, + &suffix, + block_desc, + &var_to_cast_op_map); + var_node->Var()->SetDataType(framework::proto::VarType::FP32); + } + } + } + } + } +} + +} // namespace using framework::ir::Node; void analysis::TensorRtSubgraphPass::ApplyImpl( framework::ir::Graph *graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph); + + auto model_precision = + static_cast(Get("model_precision")); + if (model_precision == phi::DataType::BFLOAT16) { + LOG(WARNING) + << "Paddle-TRT not support bf16 mixed precison, just fallback."; + return; + } + auto enable_int8 = Get("enable_int8"); auto use_calib_mode = Get("use_calib_mode"); bool no_calib_int8 = enable_int8 && !(use_calib_mode); @@ -181,15 +272,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp( } } + auto model_precision = + static_cast(Get("model_precision")); + auto mixed_black_list = + Get>("mixed_black_list"); + std::set output_names; std::set output_names_with_id; std::map origin_name_output_dims; + std::unordered_set trt_outputs; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); origin_name_output_dims[x->Name()] = x->Var()->GetShape().size(); + trt_outputs.insert(x); } + OutputProcess( + graph, trt_outputs, phi::Backend::GPU, model_precision, mixed_black_list); + std::unordered_map output_name_map; std::unordered_map graph_var_map; @@ -285,6 +386,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime); op_desc->SetAttr("shape_range_info_path", shape_range_info_path); op_desc->SetAttr("use_inspector", Get("use_inspector")); + op_desc->SetAttr("model_precision", Get("model_precision")); // we record all inputs' shapes in attr to check if they are consistent // with the real inputs' shapes retrieved from scope when trt runs. 
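As a point of reference for the plumbing above, here is a rough usage sketch (not part of this patch) of how an application might drive this path once the series lands: the blacklist set here is what ultimately reaches the pass as `mixed_black_list`. The model path and the blacklisted op name are placeholders, and the TensorRT parameters are illustrative rather than recommended values.

    // Illustrative sketch only: assumes the Exp_SetBlackListOpsForMixedModel
    // setter introduced by this patch; paths and op names are placeholders.
    #include "paddle_inference_api.h"

    int main() {
      paddle_infer::Config config("./mixed_precision_model");  // placeholder model dir
      config.EnableUseGpu(256 /*memory_pool_init_size_mb*/, 0 /*gpu_id*/);
      // Run Paddle-TRT subgraphs in fp16; other parameters are defaults.
      config.EnableTensorRtEngine(1 << 30 /*workspace*/, 1 /*max_batch*/,
                                  3 /*min_subgraph_size*/,
                                  paddle_infer::Config::Precision::kHalf,
                                  false /*use_static*/, false /*use_calib_mode*/);
      // Ops listed here stay in fp32; this must match the blacklist used when
      // the model itself was converted to mixed precision.
      config.Exp_SetBlackListOpsForMixedModel({"softmax"});
      auto predictor = paddle_infer::CreatePredictor(config);
      (void)predictor;
      return 0;
    }
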
@@ -404,7 +506,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( min_input_shape, max_input_shape, opt_input_shape, - disable_trt_plugin_fp16); + disable_trt_plugin_fp16, + static_cast(Get("model_precision"))); trt_engine->SetUseOSS(Get("use_varseqlen")); trt_engine->SetWithInterleaved(Get("with_interleaved")); trt_engine->SetTransformerPosid( diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 44e36647646fe..bc753636d2c1a 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -379,27 +380,21 @@ void ConvertToMixedPrecision(const std::string& model_file, }; std::unordered_set weights_should_be_fp32; - for (auto* node : paddle::framework::ir::TopologySortOperations(*graph)) { - if (!node->IsOp()) continue; - auto* op_desc = node->Op(); - if (op_desc->Type() == "feed" || op_desc->Type() == "fetch") continue; - - if (op_desc->Type() == "batch_norm") { - auto vecs = op_desc->Input("Bias"); - for (auto s : vecs) { - weights_should_be_fp32.insert(s); - } - vecs = op_desc->Input("Mean"); - for (auto s : vecs) { - weights_should_be_fp32.insert(s); - } - vecs = op_desc->Input("Scale"); - for (auto s : vecs) { - weights_should_be_fp32.insert(s); - } - vecs = op_desc->Input("Variance"); - for (auto s : vecs) { - weights_should_be_fp32.insert(s); + for (auto* node : graph->Nodes()) { + if (!node->IsVar()) continue; + if (node->Var()->GetType() == + paddle::framework::proto::VarType::SELECTED_ROWS || + node->Var()->GetType() == + paddle::framework::proto::VarType::LOD_TENSOR || + node->Var()->GetType() == + paddle::framework::proto::VarType::LOD_TENSOR_ARRAY || + node->Var()->GetType() == paddle::framework::proto::VarType::STRINGS || + node->Var()->GetType() == paddle::framework::proto::VarType::VOCAB) { + if (node->Var()->Persistable() && + node->Var()->GetDataType() == + paddle::framework::proto::VarType::FP32) { + VLOG(2) << "weights keep to fp32: " << node->Name(); + weights_should_be_fp32.insert(node->Name()); } } } diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 75a5d9ee4f55b..ae90618f5207c 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -256,6 +256,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); + // Mixed related. + CP_MEMBER(mixed_black_list_); + CP_MEMBER(enable_memory_optim_); // TensorRT related. 
CP_MEMBER(use_tensorrt_); @@ -871,6 +874,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << ipu_available_memory_proportion_; ss << ipu_enable_half_partial_; + for (auto &op : mixed_black_list_) ss << op.c_str(); return ss.str(); } @@ -1188,4 +1192,10 @@ bool AnalysisConfig::tuned_tensorrt_dynamic_shape() { bool AnalysisConfig::trt_allow_build_at_runtime() { return trt_allow_build_at_runtime_; } + +void AnalysisConfig::Exp_SetBlackListOpsForMixedModel( + const std::unordered_set &black_list) { + mixed_black_list_ = black_list; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 62f89e300bfbd..d008355e0ed5b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1216,7 +1216,9 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); argument_.SetScopeNotOwned(scope_.get()); + // mixed precison. argument_.SetModelPrecision(static_cast(model_precision_)); + argument_.SetMixedBlackList(config_.mixed_black_list_); } // NOTE All the members in AnalysisConfig should be copied to Argument. diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 6de23e930836a..08d0e073babc1 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -914,6 +914,14 @@ struct PD_INFER_DECL AnalysisConfig { const DistConfig& dist_config() const { return dist_config_; } + /// + /// \brief Set a list of operators that do not support mixed precision. This + /// interface is in the experimental stage and may change in the future. Note + /// that the blacklist must be the same as the model conversion blacklist. + /// + void Exp_SetBlackListOpsForMixedModel( + const std::unordered_set& black_list); + protected: // Update the config. void Update(); @@ -926,6 +934,9 @@ struct PD_INFER_DECL AnalysisConfig { mutable std::string prog_file_; mutable std::string params_file_; + // Mixed precision. + std::unordered_set mixed_black_list_; + // GPU related. 
bool use_gpu_{false}; int gpu_device_id_{0}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 73c216290dd88..0d918446ea92a 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -160,6 +160,10 @@ const std::vector kGpuLowerPrecisionPasses{ const std::vector kTrtLowerPrecisionPasses{ // "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", + "trt_map_matmul_v2_to_mul_pass", + "trt_map_matmul_v2_to_matmul_pass", + "trt_map_matmul_to_mul_pass", + "fc_fuse_pass", "tensorrt_subgraph_pass", }; diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc index 44283f4e0d7e9..017fa8800b458 100644 --- a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -50,22 +50,26 @@ class AffineChannelOpConverter : public OpConverter { auto* scale_v = scope.FindVar(scale_name); auto* scale_t = scale_v->GetMutable(); - float* scale_ptr = engine_->GetWeightCPUData(scale_name, scale_t); + float* scale_ptr = const_cast(static_cast( + engine_->GetFp32TrtWeight(scale_name, *scale_t).get().values)); auto* bias_v = scope.FindVar(bias_name); auto* bias_t = bias_v->GetMutable(); - float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t); + float* bias_ptr = const_cast(static_cast( + engine_->GetFp32TrtWeight(bias_name, *bias_t).get().values)); // tensorrt scalend layer only support spatial dims >= 2, // so nhwc is not availabe (spatial dims == 0) const int channel_axis = engine_->with_dynamic_shape(); - TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, - static_cast(scale_ptr), - (size_t)idim.d[channel_axis]}; - TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT, - static_cast(bias_ptr), - (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight scale_weights{ + nvinfer1::DataType::kFLOAT, + static_cast(scale_ptr), + static_cast(idim.d[channel_axis])}; + TensorRTEngine::Weight bias_weights{ + nvinfer1::DataType::kFLOAT, + static_cast(bias_ptr), + static_cast(idim.d[channel_axis])}; TensorRTEngine::Weight power_weights{ nvinfer1::DataType::kFLOAT, nullptr, 0}; diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 244078dc344a2..c47f6d03cd543 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace framework { @@ -48,7 +50,7 @@ void ConvertConv2d(TensorRTEngine* engine, platform::errors::NotFound("Can not find %s presistale var in scope.", filter_var_name)); auto* Y_t = Y_v->GetMutable(); - float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); if (enable_int8) { @@ -57,7 +59,6 @@ void ConvertConv2d(TensorRTEngine* engine, engine->SetTensorDynamicRange(X, in_scale); #endif } - weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t); PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL, @@ -104,21 +105,19 @@ void ConvertConv2d(TensorRTEngine* engine, nv_post_paddings.d[1] = paddings[3]; } - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; - float* bias_data = nullptr; - size_t bias_size = 0; + auto weight = engine->GetTrtWeight(op_desc.Input("Filter").front(), *Y_t); + + TensorRTEngine::Weight bias; + bias.SetDataType(weight.get().type); + bias.SetCount(0); + bias.SetValues(nullptr); if (op_desc.Type() == "conv2d_fusion") { auto* bias_tensor = scope.GetVar(op_desc.Input("Bias").front()); auto* bias_tensor_data = bias_tensor->GetMutable(); - bias_data = engine->GetWeightCPUData(op_desc.Input("Bias").front(), - bias_tensor_data); - bias_size = static_cast(bias_tensor_data->numel()); + bias = + engine->GetTrtWeight(op_desc.Input("Bias").front(), *bias_tensor_data); } - TensorRTEngine::Weight bias{ - nvinfer1::DataType::kFLOAT, static_cast(bias_data), bias_size}; // In conv2d_transpose and depthwise_conv2d_transpose, // output channels = filter_dims[1] * groups auto* layer = (op_desc.Type() == "conv2d_transpose" || diff --git a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc index 7308d44bf8320..4ffc805654727 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv3d_op.cc @@ -48,14 +48,12 @@ void ConvertConv3d(TensorRTEngine* engine, platform::errors::NotFound("Can not find %s presistale var in scope.", filter_var_name)); auto* Y_t = Y_v->GetMutable(); - float* weight_data = nullptr; bool enable_int8 = op_desc.HasAttr("enable_int8"); if (enable_int8) { float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")); engine->SetTensorDynamicRange(X, in_scale); } - weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t); PADDLE_ENFORCE_EQ(Y_t->dims().size(), 5UL, @@ -85,14 +83,12 @@ void ConvertConv3d(TensorRTEngine* engine, nvinfer1::Dims3 nv_strides(strides[0], strides[1], strides[2]); nvinfer1::Dims3 nv_paddings(paddings[0], paddings[1], paddings[2]); - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; + auto weight = engine->GetTrtWeight(op_desc.Input("Filter").front(), *Y_t); float* bias_data = nullptr; size_t bias_size = 0; TensorRTEngine::Weight bias{ - nvinfer1::DataType::kFLOAT, static_cast(bias_data), bias_size}; + weight.get().type, static_cast(bias_data), bias_size}; // In conv3d_transpose output channels = filter_dims[1] * groups auto* layer = (op_desc.Type() == "conv3d_transpose") ? 
fadd_layer(X, n_input * groups, nv_ksize, weight, bias) diff --git a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc index f0a82bebc7ca9..8cf7f6528e595 100644 --- a/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/deformable_conv_op.cc @@ -49,8 +49,6 @@ class DeformableConvOpConverter : public OpConverter { auto* filter_var = scope.FindVar(filter_name); auto* filter_tensor = filter_var->GetMutable(); - float* filter_data = engine_->GetWeightCPUData(filter_name, filter_tensor); - const int c_o = filter_tensor->dims()[0]; const int c_i = filter_tensor->dims()[1]; const int k_h = filter_tensor->dims()[2]; @@ -73,15 +71,20 @@ class DeformableConvOpConverter : public OpConverter { weights.count = filter_tensor->numel(); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); if (with_fp16) { - auto half_filter_data = new half[filter_tensor->numel()]; - for (int i = 0; i < filter_tensor->numel(); i++) { - half_filter_data[i] = static_cast(filter_data[i]); + auto filter_weight = engine_->GetTrtWeight(filter_name, *filter_tensor); + if (filter_weight.get().type == nvinfer1::DataType::kFLOAT) { + auto half_filter_data = new half[filter_tensor->numel()]; + for (int i = 0; i < filter_tensor->numel(); i++) { + half_filter_data[i] = static_cast( + static_cast(filter_weight.get().values)[i]); + } + weights.type = nvinfer1::DataType::kHALF; + weights.values = half_filter_data; + } else if (filter_weight.get().type == nvinfer1::DataType::kHALF) { + weights = filter_weight.get(); } - weights.type = nvinfer1::DataType::kHALF; - weights.values = half_filter_data; } else { - weights.type = nvinfer1::DataType::kFLOAT; - weights.values = filter_data; + weights = engine_->GetFp32TrtWeight(filter_name, *filter_tensor).get(); } auto* deformable_conv_plugin = new plugin::DeformableConvPlugin( with_fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index aff23343a078f..365523508f5df 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -33,12 +33,9 @@ class ElementwiseTensorOpConverter : public OpConverter { if (Y_v) { // Y is weight auto* Y_t = Y_v->GetMutable(); - float* weight_data = - engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t); std::vector dims_y = phi::vectorize(Y_t->dims()); - TensorRTEngine::Weight y_weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; + auto y_weight = engine_->GetTrtWeight(op_desc.Input("Y").front(), *Y_t); + nvinfer1::Dims trt_dims_y; trt_dims_y.nbDims = dims_y.size(); for (int i = 0; i < trt_dims_y.nbDims; i++) { diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 1a1f72388e40e..5020b97627753 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -10,8 +10,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/utils.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h" +#include "paddle/phi/core/ddim.h" namespace paddle { namespace framework { @@ -73,27 +76,39 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { // input_embs[0]: word_embedding // input_embs[1]: pos_embedding // input_embs[2]: sent_embedding - std::vector input_embs; + std::vector input_embs; std::vector emb_sizes; // get the presistable var's data - auto get_persistable_data = [&](const std::string& var_name, - framework::DDim* dims) -> float* { + auto GetWeight = [&](const std::string& var_name, + framework::DDim* dim) -> TensorRTEngine::Weight { auto* temp_var = scope.FindVar(var_name); auto* temp_tensor = temp_var->GetMutable(); - (*dims) = temp_tensor->dims(); + *dim = temp_tensor->dims(); + auto weight = engine_->GetTrtWeight(var_name, *temp_tensor); + return weight; + }; - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); - return temp_data; + auto GetFp32Weight = [&](const std::string& var_name, + framework::DDim* dim) -> TensorRTEngine::Weight { + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + *dim = temp_tensor->dims(); + auto weight = engine_->GetFp32TrtWeight(var_name, *temp_tensor); + return weight; }; int hidden = 0; for (int i = 0; i < input_num; i++) { framework::DDim emb_dims; - float* emb_data = get_persistable_data(emb_names[i], &emb_dims); - int64_t emb_size = phi::product(emb_dims); - input_embs.push_back(emb_data); - emb_sizes.push_back(emb_size); + TensorRTEngine::Weight weight; + if (flag_varseqlen) { + weight = GetWeight(emb_names[i], &emb_dims); + } else { + weight = GetFp32Weight(emb_names[i], &emb_dims); + } + input_embs.push_back(weight.get()); + emb_sizes.push_back(weight.get().count); PADDLE_ENFORCE_EQ( emb_dims.size(), 2, @@ -103,11 +118,15 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { } framework::DDim bias_dims, scale_dims; + TensorRTEngine::Weight bias_weight, scale_weight; + if (flag_varseqlen) { + bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims); + scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims); + } else { + bias_weight = GetFp32Weight(op_desc.Input("Bias").front(), &bias_dims); + scale_weight = GetFp32Weight(op_desc.Input("Scale").front(), &scale_dims); + } - auto* bias = - get_persistable_data(op_desc.Input("Bias").front(), &bias_dims); - auto* scale = - get_persistable_data(op_desc.Input("Scale").front(), &scale_dims); int64_t bias_size = phi::product(bias_dims); int64_t scale_size = phi::product(scale_dims); nvinfer1::ILayer* layer = nullptr; @@ -134,24 +153,24 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { "But Precision::KFloat32 is setted.")); const std::vector fields{ {"bert_embeddings_layernorm_beta", - bias, - nvinfer1::PluginFieldType::kFLOAT32, + bias_weight.get().values, + GetPluginFieldType(bias_weight.get().type), static_cast(bias_size)}, {"bert_embeddings_layernorm_gamma", - scale, - nvinfer1::PluginFieldType::kFLOAT32, + scale_weight.get().values, + GetPluginFieldType(scale_weight.get().type), static_cast(scale_size)}, {"bert_embeddings_word_embeddings", - input_embs[0], - nvinfer1::PluginFieldType::kFLOAT32, + input_embs[0].values, + GetPluginFieldType(input_embs[0].type), 
static_cast(emb_sizes[0])}, {"bert_embeddings_token_type_embeddings", - input_embs[2], - nvinfer1::PluginFieldType::kFLOAT32, + input_embs[2].values, + GetPluginFieldType(input_embs[2].type), static_cast(emb_sizes[2])}, {"bert_embeddings_position_embeddings", - input_embs[1], - nvinfer1::PluginFieldType::kFLOAT32, + input_embs[1].values, + GetPluginFieldType(input_embs[1].type), static_cast(emb_sizes[1])}, {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, }; @@ -235,15 +254,23 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin = new plugin::EmbEltwiseLayernormPluginDynamic(input_embs, - bias, - scale, - emb_sizes, - bias_size, - scale_size, - hidden, - eps, - with_fp16); + std::vector input_embs_data; + for (size_t i = 0; i < input_embs.size(); ++i) { + input_embs_data.push_back(const_cast( + static_cast(input_embs[i].values))); + } + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs_data, + const_cast( + static_cast(bias_weight.get().values)), + const_cast( + static_cast(scale_weight.get().values)), + emb_sizes, + bias_size, + scale_size, + hidden, + eps, + with_fp16); layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput( diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index ce6644cad4200..1bd9cf8712d98 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -27,6 +27,16 @@ class OpDesc; namespace paddle { namespace inference { namespace tensorrt { +namespace { +template +void tranpose_weight(const T* src, T* dst, int m, int n) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + dst[j * m + i] = src[i * n + j]; + } + } +} +} // namespace /* * FC converter convert a MUL op in Fluid to a FC layer in TRT. @@ -156,9 +166,7 @@ class FcOpConverter : public OpConverter { op_desc.HasAttr("activation_type") ? BOOST_GET_CONST(std::string, op_desc.GetAttr("activation_type")) : ""; - // This may trigger a GPU->CPU copy, because TRT's weight can only be - // assigned from CPU memory, which can't be avoided. 
- float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); bool support_int8 = false; if (op_desc.HasAttr("support_int8")) { @@ -173,7 +181,6 @@ class FcOpConverter : public OpConverter { } engine_->SetTensorDynamicRange(X, in_scale); } - weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t); PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL, @@ -183,13 +190,6 @@ class FcOpConverter : public OpConverter { Y_t->dims().size())); // a matrix int m = Y_t->dims()[0]; int n = Y_t->dims()[1]; - auto tranpose_weight = [](const float* src, float* dst, int m, int n) { - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - dst[j * m + i] = src[i * n + j]; - } - } - }; auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, @@ -283,11 +283,36 @@ class FcOpConverter : public OpConverter { transpose_y = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_Y")); } int weight_w, weight_h; + auto weight = engine_->GetTrtWeight(op_desc.Input(w_name).front(), *Y_t); + if (!transpose_y) { - std::vector weight_data_tmp; - weight_data_tmp.reserve(Y_t->numel()); - memcpy(weight_data_tmp.data(), weight_data, Y_t->numel() * sizeof(float)); - tranpose_weight(weight_data_tmp.data(), weight_data, m, n); + if (weight.get().type == nvinfer1::DataType::kFLOAT) { + std::vector weight_data_tmp; + weight_data_tmp.reserve(Y_t->numel()); + memcpy(weight_data_tmp.data(), + weight.get().values, + Y_t->numel() * sizeof(float)); + tranpose_weight( + weight_data_tmp.data(), + const_cast(static_cast(weight.get().values)), + m, + n); + } else if (weight.get().type == nvinfer1::DataType::kHALF) { + std::vector weight_data_tmp; + weight_data_tmp.reserve(Y_t->numel()); + memcpy(weight_data_tmp.data(), + weight.get().values, + Y_t->numel() * sizeof(float16)); + tranpose_weight(weight_data_tmp.data(), + const_cast( + static_cast(weight.get().values)), + m, + n); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Paddle-TRT fc convert not supporte dtype, now only support fp32 " + "and fp16.")); + } weight_w = n; weight_h = m; } else { @@ -295,22 +320,14 @@ class FcOpConverter : public OpConverter { weight_h = n; } size_t n_output = weight_w; - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(Y_t->numel())}; weight.dims.assign({weight_w, weight_h}); - float* bias_data = nullptr; - int bias_num = 0; + TensorRTEngine::Weight bias{weight.get().type, nullptr, 0}; if (with_bias) { auto* b_v = scope.GetVar(op_desc.Input("Bias").front()); auto* b_t = b_v->GetMutable(); - bias_data = engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t); - bias_num = b_t->numel(); + bias = engine_->GetTrtWeight(op_desc.Input("Bias").front(), *b_t); } - TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, - static_cast(bias_data), - static_cast(bias_num)}; // Running the TRT Static Shape mode: x_num_col_dims-1 if (!engine_->with_dynamic_shape()) { diff --git a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc index f5a2026ff6fdf..1b45264475354 100644 --- a/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/group_norm_op.cc @@ -12,6 +12,7 @@ limitations under the License. 
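A minimal, standalone sketch of the row-major transpose that the fc converter above applies when transpose_Y is false. It mirrors the tranpose_weight template so the same path can serve float and half weights; half_t below is only a hypothetical 16-bit stand-in for illustration, not Paddle's float16 type.

#include <cstdint>
#include <iostream>
#include <vector>

using half_t = std::uint16_t;  // illustrative stand-in for a 16-bit weight type

template <typename T>
void transpose_weight(const T* src, T* dst, int m, int n) {
  // src is an [m, n] row-major matrix; dst receives its [n, m] transpose.
  for (int i = 0; i < m; i++) {
    for (int j = 0; j < n; j++) {
      dst[j * m + i] = src[i * n + j];
    }
  }
}

int main() {
  const int m = 2, n = 3;
  std::vector<float> w = {1, 2, 3, 4, 5, 6};  // original [m, n] weight
  std::vector<float> tmp(w);                  // scratch copy, as the converter does
  transpose_weight(tmp.data(), w.data(), m, n);
  for (float v : w) std::cout << v << ' ';    // prints: 1 4 2 5 3 6
  std::cout << '\n';
  return 0;
}

The converter keeps the transposed data in the original weight buffer, so the fully connected layer sees an [n, m] matrix without an extra allocation.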
*/ #include #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { namespace framework { @@ -44,30 +45,20 @@ class GroupNormOpConverter : public OpConverter { std::string bias_name = op_desc.Input("Bias").front(); // get the presistable var's data - auto get_persistable_data = [&](const std::string& var_name, - framework::DDim* dims) -> float* { + auto GetWeight = [&](const std::string& var_name, + framework::DDim* dims) -> TensorRTEngine::Weight { auto* temp_var = scope.FindVar(var_name); auto* temp_tensor = temp_var->GetMutable(); (*dims) = temp_tensor->dims(); - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); - return temp_data; + auto weight = engine_->GetTrtWeight(var_name, *temp_tensor); + return weight; }; framework::DDim scale_dims; framework::DDim bias_dims; - float* scale_data = get_persistable_data(scale_name, &scale_dims); - float* bias_data = get_persistable_data(bias_name, &bias_dims); - - int64_t scale_numel = phi::product(scale_dims); - int64_t bias_numel = phi::product(bias_dims); - - TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, - static_cast(scale_data), - static_cast(scale_numel)}; - TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT, - static_cast(bias_data), - static_cast(bias_numel)}; + auto scale_weights = GetWeight(scale_name, &scale_dims); + auto bias_weights = GetWeight(bias_name, &bias_dims); nvinfer1::Dims scale_nv_dims; nvinfer1::Dims bias_nv_dims; diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index a82101e29f571..c899f4f6e777e 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -49,20 +49,10 @@ class LayerNormOpConverter : public OpConverter { auto* Bias_t = Bias_v->GetMutable(); auto* Scale_t = Scale_v->GetMutable(); - std::unique_ptr bias_tensor( - new framework::LoDTensor()); - std::unique_ptr scale_tensor( - new framework::LoDTensor()); - - bias_tensor->Resize(Bias_t->dims()); - scale_tensor->Resize(Scale_t->dims()); - - platform::CPUPlace cpu_place; - paddle::framework::TensorCopySync((*Bias_t), cpu_place, &(*bias_tensor)); - paddle::framework::TensorCopySync((*Scale_t), cpu_place, &(*scale_tensor)); - - auto* bias_data = bias_tensor->mutable_data(platform::CPUPlace()); - auto* scale_data = scale_tensor->mutable_data(platform::CPUPlace()); + auto bias_weight = + engine_->GetFp32TrtWeight(op_desc.Input("Bias").front(), *Bias_t); + auto scale_weight = + engine_->GetFp32TrtWeight(op_desc.Input("Scale").front(), *Scale_t); nvinfer1::ILayer* layernorm_layer = nullptr; if (engine_->with_dynamic_shape()) { @@ -73,14 +63,15 @@ class LayerNormOpConverter : public OpConverter { std::vector mean_shape{input_num}; std::vector variance_shape{input_num}; plugin::LayerNormPluginDynamic* plugin = - new plugin::LayerNormPluginDynamic(bias_data, - bias_tensor->numel(), - scale_data, - scale_tensor->numel(), - begin_norm_axis, - eps, - mean_shape, - variance_shape); + new plugin::LayerNormPluginDynamic( + static_cast(bias_weight.get().values), + bias_weight.get().count, + static_cast(scale_weight.get().values), + scale_weight.get().count, + begin_norm_axis, + eps, + mean_shape, + variance_shape); layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin); } else { int input_num = 1; @@ -89,23 +80,20 @@ class LayerNormOpConverter : public OpConverter { } std::vector 
mean_shape{input_num}; std::vector variance_shape{input_num}; - plugin::LayerNormPlugin* plugin = - new plugin::LayerNormPlugin(bias_data, - bias_tensor->numel(), - scale_data, - scale_tensor->numel(), - begin_norm_axis, - eps, - mean_shape, - variance_shape); + plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin( + static_cast(bias_weight.get().values), + bias_weight.get().count, + static_cast(scale_weight.get().values), + scale_weight.get().count, + begin_norm_axis, + eps, + mean_shape, + variance_shape); layernorm_layer = engine_->AddPlugin( &X, 1, reinterpret_cast(plugin)); } auto output_name = op_desc.Output("Y").front(); - engine_->SetWeights(op_desc.Input("Bias").front(), std::move(bias_tensor)); - engine_->SetWeights(op_desc.Input("Scale").front(), - std::move(scale_tensor)); RreplenishLayerAndOutput( layernorm_layer, "layer_norm", {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index d30dc5eb35b15..8bc44cc6ab9d2 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -48,9 +48,11 @@ class MultiheadMatMulOpConverter : public OpConverter { in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")); engine_->SetTensorDynamicRange(input, in_scale); } - weight_data = engine_->GetWeightCPUData(weight_name, weight_t); + weight_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(weight_name, *weight_t).get().values)); - float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t); + float* bias_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(bias_name, *bias_t).get().values)); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); memcpy( diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 8bcc926b856e2..0eb2bc0875fdf 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -343,6 +343,8 @@ class OpConverter { FluidDataType2TRT( var->Proto()->type().lod_tensor().tensor().data_type()), Vec2TRT_Dims(var_shape, input)); + VLOG(1) << "Set trt input [" << input << "] type is " + << var->Proto()->type().lod_tensor().tensor().data_type(); } } PADDLE_ENFORCE_EQ(all_dynamic_shape_set, @@ -561,33 +563,8 @@ class OpConverter { const std::string& name) { auto* var_v = scope.FindVar(name); auto* var_t = var_v->GetMutable(); - void* trt_ptr = nullptr; - size_t trt_num = static_cast(var_t->numel()); - nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; - if (var_t->dtype() == phi::DataType::FLOAT32) { - float* data_ptr = engine_->GetWeightCPUData(name, var_t); - trt_ptr = static_cast(data_ptr); - } else if (var_t->dtype() == phi::DataType::INT32) { - int32_t* data_ptr = engine_->GetWeightCPUData(name, var_t); - trt_ptr = static_cast(data_ptr); - trt_dtype = nvinfer1::DataType::kINT32; - } else if (var_t->dtype() == phi::DataType::INT64) { - int64_t* data_ptr = engine_->GetWeightCPUData(name, var_t); - // We must create a new framework::Tensor() - std::unique_ptr new_var_t(new framework::Tensor()); - new_var_t->Resize({var_t->numel()}); - int32_t* new_data_ptr = - new_var_t->mutable_data(platform::CPUPlace()); - for (size_t i = 0; i < trt_num; i++) { - new_data_ptr[i] = data_ptr[i]; - } - engine_->SetWeights(name, std::move(new_var_t)); - trt_ptr = static_cast(new_data_ptr); - 
trt_dtype = nvinfer1::DataType::kINT32; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported datatype in TensorRT")); - } + auto weight = engine_->GetTrtWeight(name, *var_t); + // Now we have create weights, then we need create a itensor auto var_dims = var_t->dims(); nvinfer1::Dims trt_in_shape; @@ -603,7 +580,6 @@ class OpConverter { trt_in_shape.d[i] = trt_in_shape.d[i + 1]; } } - TensorRTEngine::Weight weight{trt_dtype, trt_ptr, trt_num}; nvinfer1::ILayer* layer = TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_in_shape, weight.get()); engine_->SetITensor(name, layer->getOutput(0)); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index 78dd812e035db..5bfa1170fa109 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -81,7 +81,8 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto* temp_tensor = temp_var->GetMutable(); (*dims) = temp_tensor->dims(); - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); + auto* temp_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(var_name, *temp_tensor).get().values)); return temp_data; }; diff --git a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc index d09df4a4f2818..7b89b62dc8b66 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_residual_bias.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.h" namespace paddle { @@ -43,7 +44,8 @@ class PrelnResidualBiasOpConverter : public OpConverter { auto* temp_var = scope.FindVar(var_name); auto* temp_tensor = temp_var->GetMutable(); (*dims) = temp_tensor->dims(); - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); + auto* temp_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(var_name, *temp_tensor).get().values)); return temp_data; }; framework::DDim bias_dims, scale_dims, ele_bias_dims; diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc index 7824a9b23dc5e..bc9b317920755 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -49,7 +49,8 @@ class PrelnSkipLayerNormOpConverter : public OpConverter { auto* temp_tensor = temp_var->GetMutable(); (*dims) = temp_tensor->dims(); - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); + auto* temp_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(var_name, *temp_tensor).get().values)); return temp_data; }; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 3195833c0e570..38b01eff6fb19 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -43,28 +43,21 @@ class PReluOpConverter : public OpConverter { auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]); auto* alpha_tensor = 
alpha_var->GetMutable(); + auto alpha_weight = + engine_->GetFp32TrtWeight(op_desc.Input("Alpha")[0], *alpha_tensor); + platform::CPUPlace cpu_place; - std::unique_ptr alpha_tensor_temp( - new framework::LoDTensor()); - alpha_tensor_temp->Resize(alpha_tensor->dims()); - paddle::framework::TensorCopySync( - *alpha_tensor, cpu_place, alpha_tensor_temp.get()); - float* alpha_data = alpha_tensor_temp->mutable_data(cpu_place); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic( - alpha_data, alpha_tensor_temp->numel(), mode, data_format); + static_cast(alpha_weight.get().values), + alpha_tensor->numel(), + mode, + data_format); layer = engine_->AddDynamicPlugin(&input, input_num, plugin); } else { #if IS_TRT_VERSION_GE(7000) - float* alpha_weight_data = - engine_->GetWeightCPUData(op_desc.Input("Alpha")[0], alpha_tensor); - TensorRTEngine::Weight alpha_weight{ - nvinfer1::DataType::kFLOAT, - static_cast(alpha_weight_data), - static_cast(alpha_tensor->numel())}; - nvinfer1::Dims dims; dims.nbDims = 0; // jump batch dim @@ -83,13 +76,13 @@ class PReluOpConverter : public OpConverter { engine_, ParametricReLU, *input, *alpha_layer_output); #else plugin::PReluPlugin* plugin = new plugin::PReluPlugin( - alpha_data, alpha_tensor_temp->numel(), mode, data_format); + static_cast(alpha_weight.get().values), + alpha_tensor->numel(), + mode, + data_format); layer = engine_->AddPlugin(&input, input_num, plugin); #endif } - // keep alpha tensor to avoid release it's memory - engine_->SetWeights(op_desc.Input("Alpha")[0], - std::move(alpha_tensor_temp)); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 9ed72610dc179..cf95a4d9b55e0 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/utils.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h" namespace paddle { @@ -34,22 +36,6 @@ class SkipLayerNormOpConverter : public OpConverter { inputs.push_back(input1); inputs.push_back(input2); - auto get_persistable_data = [&](const std::string& arg_name, - framework::DDim* dims) -> float* { - std::string var_name = op_desc.Input(arg_name).front(); - auto* temp_var = scope.FindVar(var_name); - auto* temp_tensor = temp_var->GetMutable(); - (*dims) = temp_tensor->dims(); - - auto* temp_data = engine_->GetWeightCPUData(var_name, temp_tensor); - return temp_data; - }; - - framework::DDim bias_dims, scale_dims; - auto* bias = get_persistable_data("Bias", &bias_dims); - auto* scale = get_persistable_data("Scale", &scale_dims); - int bias_size = phi::product(bias_dims); - int scale_size = phi::product(scale_dims); bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; @@ -57,6 +43,18 @@ class SkipLayerNormOpConverter : public OpConverter { engine_->tensorrt_transformer_posid() != "" && engine_->tensorrt_transformer_maskid() != ""; if (flag_varseqlen) { + auto GetWeight = + [&](const std::string& arg_name) -> TensorRTEngine::Weight { + std::string var_name = op_desc.Input(arg_name).front(); + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + auto weight = engine_->GetTrtWeight(var_name, *temp_tensor); + return weight; + }; + + auto bias_weight = GetWeight("Bias").get(); + auto scale_weight = GetWeight("Scale").get(); + if (engine_->with_interleaved()) { VLOG(4) << "fused skip_layernorm op: use_varseqlen and with_interleaved"; @@ -72,11 +70,14 @@ class SkipLayerNormOpConverter : public OpConverter { platform::errors::InvalidArgument( "fail to get creator of CustomSkipLayerNormPluginDynamic")); const std::vector fields{ - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"beta", + bias_weight.values, + GetPluginFieldType(bias_weight.type), + static_cast(bias_weight.count)}, { "gamma", - scale, - nvinfer1::PluginFieldType::kFLOAT32, - scale_size }}; + scale_weight.values, + GetPluginFieldType(scale_weight.type), + static_cast(scale_weight.count) }}; nvinfer1::PluginFieldCollection* pluginPtr = static_cast( malloc(sizeof(*pluginPtr) + @@ -119,8 +120,14 @@ class SkipLayerNormOpConverter : public OpConverter { const std::vector fields{ {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, - {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + {"beta", + bias_weight.values, + GetPluginFieldType(bias_weight.type), + static_cast(bias_weight.count)}, + {"gamma", + scale_weight.values, + GetPluginFieldType(scale_weight.type), + static_cast(scale_weight.count)}, }; nvinfer1::PluginFieldCollection* pluginPtr = static_cast( @@ -143,12 +150,29 @@ class SkipLayerNormOpConverter : public OpConverter { layer = plugin_layer; } } else { + auto GetFp32Weight = + [&](const std::string& arg_name) -> TensorRTEngine::Weight { + std::string var_name = op_desc.Input(arg_name).front(); + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + auto weight = engine_->GetFp32TrtWeight(var_name, *temp_tensor); + return weight; + }; + + auto bias_weight = 
GetFp32Weight("Bias").get(); + auto scale_weight = GetFp32Weight("Scale").get(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SkipLayerNormPluginDynamic* plugin = new plugin::SkipLayerNormPluginDynamic( - bias, scale, bias_size, scale_size, eps, with_fp16); + static_cast(bias_weight.values), + static_cast(scale_weight.values), + bias_weight.count, + scale_weight.count, + eps, + with_fp16); layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } diff --git a/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc index 6974e5a77006e..33801e969172a 100644 --- a/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/sparse_fc_op.cc @@ -154,7 +154,10 @@ class SparseFcOpConverter : public OpConverter { } engine_->SetTensorDynamicRange(X, in_scale); } - weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t); + weight_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(op_desc.Input(w_name).front(), *Y_t) + .get() + .values)); PADDLE_ENFORCE_EQ( Y_t->dims().size(), @@ -321,7 +324,10 @@ class SparseFcOpConverter : public OpConverter { if (with_bias) { auto* b_v = scope.GetVar(op_desc.Input("Bias").front()); auto* b_t = b_v->GetMutable(); - bias_data = engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t); + bias_data = weight_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(op_desc.Input("Bias").front(), *b_t) + .get() + .values)); bias_num = b_t->numel(); } // Running the TRT Static Shape mode: x_num_col_dims-1 diff --git a/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc index 7f54f97d34933..4a8d15ef0dbac 100644 --- a/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/sparse_multihead_matmul_op.cc @@ -64,9 +64,11 @@ class SparseMultiheadMatMulOpConverter : public OpConverter { in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")); engine_->SetTensorDynamicRange(input, in_scale); } - weight_data = engine_->GetWeightCPUData(weight_name, weight_t); + weight_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(weight_name, *weight_t).get().values)); - float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t); + float* bias_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(bias_name, *bias_t).get().values)); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); memcpy( diff --git a/paddle/fluid/inference/tensorrt/convert/utils.h b/paddle/fluid/inference/tensorrt/convert/utils.h new file mode 100644 index 0000000000000..1415e67fbeccd --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/utils.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "paddle/fluid/inference/tensorrt/engine.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +inline nvinfer1::PluginFieldType GetPluginFieldType(nvinfer1::DataType type) { + switch (type) { +#if IS_TRT_VERSION_GE(7000) + case nvinfer1::DataType::kBOOL: + return nvinfer1::PluginFieldType::kCHAR; +#endif + case nvinfer1::DataType::kFLOAT: + return nvinfer1::PluginFieldType::kFLOAT32; + case nvinfer1::DataType::kHALF: + return nvinfer1::PluginFieldType::kFLOAT16; + case nvinfer1::DataType::kINT32: + return nvinfer1::PluginFieldType::kINT32; + case nvinfer1::DataType::kINT8: + return nvinfer1::PluginFieldType::kINT8; + default: + return nvinfer1::PluginFieldType::kUNKNOWN; + } +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 9fe8f67e6a657..a4d373e83b355 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -19,15 +19,46 @@ limitations under the License. */ #include +#include "NvInferRuntimeCommon.h" #include "cuda_runtime_api.h" // NOLINT #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace inference { namespace tensorrt { +void TensorRTEngine::Weight::SetDataType(phi::DataType type) { + nvinfer1::DataType nv_type; + switch (type) { + case phi::DataType::FLOAT32: + nv_type = nvinfer1::DataType::kFLOAT; + break; + case phi::DataType::FLOAT16: + nv_type = nvinfer1::DataType::kHALF; + break; + case phi::DataType::INT32: + nv_type = nvinfer1::DataType::kINT32; + break; + case phi::DataType::INT8: + nv_type = nvinfer1::DataType::kINT8; + break; +#if IS_TRT_VERSION_GE(7000) + case phi::DataType::BOOL: + nv_type = nvinfer1::DataType::kBOOL; + break; +#endif + default: + paddle::platform::errors::InvalidArgument( + "Paddle-TRT loads weighths failed, found not supported data type %s.", + type); + break; + } + w_.type = nv_type; +} + int TensorRTEngine::runtime_batch_ = 1; void TensorRTEngine::InitNetwork() { @@ -197,6 +228,18 @@ void TensorRTEngine::FreezeNetwork() { } } + // If model is mixed precision, then we should cast all float output to + // float32 precision. Otherwise, we can not confirm the output precision of + // the trt engine. 
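A standalone analogue of how the converters above use GetPluginFieldType from utils.h: the plugin field's type tag is derived from the weight's own data type instead of being hard-coded to kFLOAT32, so fp16 weights can be handed to a plugin without an extra cast. All enums and the PluginField struct below are simplified stand-ins for illustration, not the real nvinfer1 API.

#include <cstdint>
#include <iostream>

enum class DataType { kFLOAT, kHALF, kINT32, kINT8 };                        // stand-in
enum class PluginFieldType { kFLOAT32, kFLOAT16, kINT32, kINT8, kUNKNOWN };  // stand-in

PluginFieldType GetPluginFieldType(DataType type) {
  switch (type) {
    case DataType::kFLOAT: return PluginFieldType::kFLOAT32;
    case DataType::kHALF:  return PluginFieldType::kFLOAT16;
    case DataType::kINT32: return PluginFieldType::kINT32;
    case DataType::kINT8:  return PluginFieldType::kINT8;
    default:               return PluginFieldType::kUNKNOWN;
  }
}

struct PluginField {  // simplified: name, data pointer, type tag, element count
  const char* name;
  const void* data;
  PluginFieldType type;
  std::int32_t length;
};

int main() {
  float beta[2] = {0.0f, 0.0f};
  PluginField field{"beta", beta, GetPluginFieldType(DataType::kFLOAT), 2};
  std::cout << "beta field type tag = " << static_cast<int>(field.type) << '\n';
  return 0;
}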
+ if (model_precision_ != phi::DataType::FLOAT32) { + for (int i = 0; i < network()->getNbOutputs(); ++i) { + network()->getOutput(i)->setAllowedFormats( + static_cast( + 1 << static_cast(nvinfer1::TensorFormat::kLINEAR))); + network()->getOutput(i)->setType(nvinfer1::DataType::kFLOAT); + } + } + if (use_dla_) { if (!enable_int8 && !enable_fp16) { LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you " @@ -399,26 +442,126 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { runtime_batch_ = batch_size; } -template -T *TensorRTEngine::GetWeightCPUData(const std::string &name, - framework::Tensor *weight_tensor) { - std::unique_ptr cpu_weight_tensor(new framework::Tensor()); +TensorRTEngine::Weight TensorRTEngine::GetFp32TrtWeight( + const std::string &name, const framework::Tensor &weight_tensor) { + static int name_suffix_counter = 0; + std::string name_suffix = std::to_string(name_suffix_counter); + std::string splitter = "__"; + std::string name_with_suffix = name + splitter + name_suffix; platform::CPUPlace cpu_place; - cpu_weight_tensor->Resize(weight_tensor->dims()); - paddle::framework::TensorCopySync( - *weight_tensor, cpu_place, cpu_weight_tensor.get()); - T *weight_data = cpu_weight_tensor->mutable_data(cpu_place); - SetWeights(name, std::move(cpu_weight_tensor)); - return weight_data; + PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), + 0, + platform::errors::AlreadyExists( + "The weight named %s is set into the weight map " + "twice in TRT OP converter.", + name_with_suffix)); + weight_map[name_with_suffix].reset(new framework::Tensor()); + weight_map[name_with_suffix]->Resize(weight_tensor.dims()); + + TensorRTEngine::Weight weight; + weight.SetCount(weight_tensor.numel()); + weight.SetDataType(nvinfer1::DataType::kFLOAT); + // weight_tensor.dims().; + + // if trt not support dtype, we need to cast to fp32. 
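GetFp32TrtWeight above builds the weight_map key from the variable name plus a monotonically increasing suffix, so converting the same persistable variable more than once never collides in the map. A small standalone sketch of that naming scheme (names below are illustrative, not the engine's members):

#include <iostream>
#include <string>

std::string UniqueWeightKey(const std::string& name) {
  static int name_suffix_counter = 0;  // shared across all conversions
  const std::string splitter = "__";
  return name + splitter + std::to_string(name_suffix_counter++);
}

int main() {
  std::cout << UniqueWeightKey("fc_0.w_0") << '\n';  // fc_0.w_0__0
  std::cout << UniqueWeightKey("fc_0.w_0") << '\n';  // fc_0.w_0__1
  return 0;
}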
+ if (weight_tensor.dtype() == phi::DataType::BFLOAT16) { + framework::Tensor bf16_tensor; + bf16_tensor.clear(); + paddle::framework::TensorCopySync( + weight_tensor, platform::CPUPlace(), &bf16_tensor); + weight_map[name_with_suffix]->set_type( + paddle::experimental::DataType::FLOAT32); + weight_map[name_with_suffix]->Resize(weight_tensor.dims()); + auto *fp32_data = + weight_map[name_with_suffix]->mutable_data(platform::CPUPlace()); + auto *bf16_data = bf16_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor.numel(); i++) { + fp32_data[i] = static_cast(bf16_data[i]); + } + } else if (weight_tensor.dtype() == phi::DataType::FLOAT16) { + framework::Tensor fp16_tensor; + fp16_tensor.clear(); + paddle::framework::TensorCopySync( + weight_tensor, platform::CPUPlace(), &fp16_tensor); + weight_map[name_with_suffix]->set_type( + paddle::experimental::DataType::FLOAT32); + weight_map[name_with_suffix]->Resize(weight_tensor.dims()); + auto *fp32_data = + weight_map[name_with_suffix]->mutable_data(platform::CPUPlace()); + auto *fp16_data = fp16_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor.numel(); i++) { + fp32_data[i] = static_cast(fp16_data[i]); + } + } else { + paddle::framework::TensorCopySync( + weight_tensor, cpu_place, weight_map[name_with_suffix].get()); + } + weight.SetValues(weight_map[name_with_suffix]->data()); + name_suffix_counter += 1; + return weight; } -template float *TensorRTEngine::GetWeightCPUData( - const std::string &name, framework::Tensor *weight_tensor); -template int32_t *TensorRTEngine::GetWeightCPUData( - const std::string &name, framework::Tensor *weight_tensor); +TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( + const std::string &name, const framework::Tensor &weight_tensor) { + static int name_suffix_counter = 0; + std::string name_suffix = std::to_string(name_suffix_counter); + std::string splitter = "__"; + std::string name_with_suffix = name + splitter + name_suffix; + platform::CPUPlace cpu_place; + PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), + 0, + platform::errors::AlreadyExists( + "The weight named %s is set into the weight map " + "twice in TRT OP converter.", + name_with_suffix)); + + weight_map[name_with_suffix].reset(new framework::Tensor()); + weight_map[name_with_suffix]->Resize(weight_tensor.dims()); + + TensorRTEngine::Weight weight; + weight.SetCount(weight_tensor.numel()); + + // if trt not support dtype, we need to cast to fp32. 
+ if (weight_tensor.dtype() == phi::DataType::BFLOAT16) { + framework::Tensor bf16_tensor; + bf16_tensor.clear(); + paddle::framework::TensorCopySync( + weight_tensor, platform::CPUPlace(), &bf16_tensor); + weight_map[name_with_suffix]->set_type( + paddle::experimental::DataType::FLOAT32); + auto *fp32_data = + weight_map[name_with_suffix]->mutable_data(platform::CPUPlace()); + auto *bf16_data = bf16_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor.numel(); i++) { + fp32_data[i] = static_cast(bf16_data[i]); + } + weight.SetDataType(phi::DataType::FLOAT32); + weight.SetValues(fp32_data); + } else if (weight_tensor.dtype() == phi::DataType::INT64) { + framework::Tensor int64_tensor; + int64_tensor.clear(); + paddle::framework::TensorCopySync( + weight_tensor, platform::CPUPlace(), &int64_tensor); + weight_map[name_with_suffix]->set_type( + paddle::experimental::DataType::INT32); + auto *int32_data = + weight_map[name_with_suffix]->mutable_data(platform::CPUPlace()); + auto *int64_data = int64_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < weight_tensor.numel(); i++) { + int32_data[i] = int64_data[i]; + } + weight.SetDataType(phi::DataType::FLOAT32); + weight.SetValues(int32_data); + } else { + paddle::framework::TensorCopySync( + weight_tensor, cpu_place, weight_map[name_with_suffix].get()); + weight.SetDataType(weight_tensor.dtype()); + weight.SetValues(weight_map[name_with_suffix]->data()); + } -template int64_t *TensorRTEngine::GetWeightCPUData( - const std::string &name, framework::Tensor *weight_tensor); + name_suffix_counter += 1; + return weight; +} int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 5c2bb6e0ca07f..73506eb8f6244 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -25,6 +25,8 @@ limitations under the License. */ #include #include +#include "NvInferRuntimeCommon.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/api/paddle_analysis_config.h" @@ -34,6 +36,7 @@ limitations under the License. 
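TensorRT has no int64 weight type, so GetTrtWeight above narrows int64 tensors to an int32 buffer and keeps that buffer alive in the engine's weight_map for as long as the network needs it. A simplified CPU-only sketch of that ownership pattern follows; the WeightView struct and the map are illustrative stand-ins, not the engine types.

#include <cstdint>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct WeightView {
  const void* values = nullptr;  // CPU pointer later handed to TensorRT
  std::int64_t count = 0;
};

// Stand-in for the engine's weight_map: owns the narrowed CPU buffers.
std::unordered_map<std::string, std::unique_ptr<std::vector<std::int32_t>>>
    weight_map;

WeightView GetTrtWeightFromInt64(const std::string& key,
                                 const std::vector<std::int64_t>& src) {
  auto buf = std::make_unique<std::vector<std::int32_t>>(src.size());
  for (std::size_t i = 0; i < src.size(); ++i) {
    (*buf)[i] = static_cast<std::int32_t>(src[i]);  // narrowing cast, as above
  }
  WeightView w{buf->data(), static_cast<std::int64_t>(buf->size())};
  weight_map[key] = std::move(buf);  // keep the memory alive for the engine
  return w;
}

int main() {
  std::vector<std::int64_t> ids = {3, 1, 4, 1, 5};
  WeightView w = GetTrtWeightFromInt64("ids__0", ids);
  std::cout << "int32 weight count = " << w.count << '\n';  // 5
  return 0;
}

Bf16 sources are widened to fp32 in the same spirit (TensorRT has no bf16 weights), while GetFp32TrtWeight additionally widens fp16 for converters that must receive float data.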
*/ #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/data_type.h" #include "paddle/utils/any.h" namespace paddle { @@ -187,6 +190,14 @@ class TensorRTEngine { } const nvinfer1::Weights& get() { return w_; } + void SetDataType(nvinfer1::DataType type) { w_.type = type; } + + void SetDataType(phi::DataType type); + + void SetValues(const void* values) { w_.values = values; } + + void SetCount(int64_t num) { w_.count = num; } + std::vector dims; private: @@ -203,6 +214,7 @@ class TensorRTEngine { const ShapeMapType max_input_shape = {}, const ShapeMapType optim_input_shape = {}, bool disable_trt_plugin_fp16 = false, + phi::DataType model_precision = phi::DataType::FLOAT32, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), @@ -213,6 +225,7 @@ class TensorRTEngine { max_input_shape_(max_input_shape), optim_input_shape_(optim_input_shape), disable_trt_plugin_fp16_(disable_trt_plugin_fp16), + model_precision_(model_precision), logger_(logger) { if (min_input_shape_.size() != 0 && max_input_shape_.size() != 0 && optim_input_shape_.size() != 0) { @@ -407,6 +420,14 @@ class TensorRTEngine { quant_dynamic_range_[tensor] = range; } + // Get fp32 trt weight. If src weight is not fp32, we will cast. + Weight GetFp32TrtWeight(const std::string& name, + const framework::Tensor& weight_tensor); + + // if the src weight type is fp16, then return fp16 trt weight, etc. + Weight GetTrtWeight(const std::string& name, + const framework::Tensor& weight_tensor); + float GetTensorDynamicRange(nvinfer1::ITensor* tensor) { return quant_dynamic_range_[tensor]; } @@ -415,10 +436,6 @@ class TensorRTEngine { return quant_dynamic_range_.count(tensor); } - template - T* GetWeightCPUData(const std::string& name, - framework::Tensor* weight_tensor); - // A pointer to CPU memory is needed of the TRT weight. // Before TRT runs, fluid loads weight into GPU storage. // so we need to copy the weights from GPU to CPU in our op converter. @@ -669,6 +686,7 @@ class TensorRTEngine { ShapeMapType max_input_shape_; ShapeMapType optim_input_shape_; bool disable_trt_plugin_fp16_{false}; + phi::DataType model_precision_{phi::DataType::FLOAT32}; bool use_varseqlen_{false}; bool use_dla_{false}; int dla_core_{0}; @@ -756,6 +774,7 @@ class TRTEngineManager { const std::map> max_input_shape = {}, const std::map> optim_input_shape = {}, bool disable_trt_plugin_fp16 = false, + phi::DataType model_precision = phi::DataType::FLOAT32, nvinfer1::ILogger& logger = NaiveLogger::Global()) { auto* p = new TensorRTEngine(max_batch, max_workspace, @@ -766,6 +785,7 @@ class TRTEngineManager { max_input_shape, optim_input_shape, disable_trt_plugin_fp16, + model_precision, logger); engines_[name].reset(p); return p; diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index 8f20ffb5e6b8c..eae1e2baf9ad1 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/phi/common/data_type.h" #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) #include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" #endif @@ -66,6 +67,7 @@ class TensorRTDynamicEngineTest : public ::testing::Test { max_input_shape, optim_input_shape, false, + phi::DataType::FLOAT32, NaiveLogger::Global()); engine_->InitNetwork(); } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index b0ac285b5d38d..1cd2683796acd 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -14,7 +14,12 @@ #pragma once +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" #ifdef PADDLE_WITH_CUDA #include @@ -192,6 +197,7 @@ class TensorRTEngineOp : public framework::OperatorBase { std::map> min_input_shape_{}; std::map> max_input_shape_{}; std::map> opt_input_shape_{}; + phi::DataType model_precision_{phi::DataType::FLOAT32}; public: TensorRTEngineOp(const std::string &type, @@ -217,6 +223,7 @@ class TensorRTEngineOp : public framework::OperatorBase { if (use_static_engine_) { model_opt_cache_dir_ = Attr("model_opt_cache_dir"); } + model_precision_ = static_cast(Attr("model_precision")); if (HasAttr("dynamic_shape_names") && HasAttr("min_input_shape") && HasAttr("max_input_shape") && HasAttr("opt_input_shape")) { @@ -555,6 +562,7 @@ class TensorRTEngineOp : public framework::OperatorBase { #endif } runtime_batch = t_shape[0]; + VLOG(1) << "trt input [" << x << "] dtype is " << t.dtype(); auto type = framework::TransToProtoVarType(t.dtype()); if (type == framework::proto::VarType::FP32) { buffers[bind_index] = static_cast(t.data()); @@ -619,6 +627,8 @@ class TensorRTEngineOp : public framework::OperatorBase { num_bindings)); auto trt_type = engine->engine()->getBindingDataType(bind_index); // get adr and set type + VLOG(1) << "trt output [" << y << "] dtype is " + << TRT2FluidDataType(trt_type); buffers[bind_index] = static_cast( fluid_t->mutable_data(dev_place, TRT2FluidDataType(trt_type))); output_index += 1; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index cbe14195d4106..8e2b162babce9 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" +#include "paddle/phi/common/data_type.h" USE_NO_KERNEL_OP(tensorrt_engine); namespace paddle { @@ -132,6 +133,8 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetAttr("min_input_shape", std::vector{1, 4, 1, 1}); engine_op_desc.SetAttr("max_input_shape", std::vector{2, 4, 1, 1}); engine_op_desc.SetAttr("opt_input_shape", std::vector{2, 4, 1, 1}); + engine_op_desc.SetAttr("model_precision", + static_cast(phi::DataType::FLOAT32)); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); From 69a4a39f92a4b832e885d219c332cf77b6320b49 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Sat, 9 Jul 2022 12:56:55 +0800 Subject: [PATCH 110/250] merge develop (#43995) --- paddle/fluid/eager/api/manual/CMakeLists.txt | 1 + .../manual/eager_manual/dygraph_forward_api.h | 3 + .../eager_manual/forwards/CMakeLists.txt | 9 +- .../eager_manual/forwards/add_n_fwd_func.cc | 109 ++++++++++++++++++ .../manual/eager_manual/nodes/CMakeLists.txt | 7 +- .../manual/eager_manual/nodes/add_n_node.cc | 78 +++++++++++++ .../api/manual/eager_manual/nodes/nodes.h | 47 ++++++++ .../final_state_generator/eager_gen.py | 6 +- paddle/phi/api/lib/api_custom_impl.cc | 43 ------- paddle/phi/api/lib/api_custom_impl.h | 4 - paddle/phi/api/yaml/legacy_backward.yaml | 7 -- .../fluid/tests/unittests/test_sum_op.py | 23 ++++ 12 files changed, 280 insertions(+), 57 deletions(-) create mode 100644 paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc create mode 100644 paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc diff --git a/paddle/fluid/eager/api/manual/CMakeLists.txt b/paddle/fluid/eager/api/manual/CMakeLists.txt index e6db90ccc5bbe..8c4ce6d2bdbf8 100644 --- a/paddle/fluid/eager/api/manual/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/CMakeLists.txt @@ -6,6 +6,7 @@ if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) set(fluid_manual_nodes ${fluid_manual_nodes} PARENT_SCOPE) + add_subdirectory(eager_manual) set(eager_manual_functions ${eager_manual_functions} diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h index 0f06831068161..f9d10600a9a00 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -16,6 +16,9 @@ #include "paddle/phi/api/include/tensor.h" +paddle::experimental::Tensor add_n_final_state_dygraph_function( + const std::vector& x); + paddle::experimental::Tensor conv2d_final_state_dygraph_function( const paddle::experimental::Tensor& input, const paddle::experimental::Tensor& filter, diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt index 0ed2f26c0b255..d71f1153e2fc0 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt @@ -1,3 +1,10 @@ +cc_library( + add_n_fwd_func + SRCS add_n_fwd_func.cc + DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + +add_dependencies(add_n_fwd_func eager_codegen) + cc_library( conv2d_fwd_function SRCS conv2d_fwd_function.cc @@ -6,5 +13,5 @@ cc_library( add_dependencies(conv2d_fwd_function eager_codegen) 
set(eager_manual_functions - conv2d_fwd_function + conv2d_fwd_function add_n_fwd_func PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc new file mode 100644 index 0000000000000..226197b0f84ad --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/eager_amp_auto_cast.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" +DECLARE_bool(check_nan_inf); + +paddle::experimental::Tensor add_n_final_state_dygraph_function( + const std::vector& x) { + // Dygraph Record Event + paddle::platform::RecordEvent dygraph_entrance_record_event( + "add_n dygraph", paddle::platform::TracerEventType::Operator, 1); + + // AMP Logic + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + auto op_name = phi::TransToFluidOpName("add_n"); + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {x}; + + auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector); + + auto NEW_x = egr::EagerAmpAutoCasts("x", x, amp_dst_dtype, op_name); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return add_n_final_state_dygraph_function(NEW_x); + } + } + + // Get Input AutoGradMeta + std::vector x_autograd_meta_vec = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector* x_autograd_meta = &x_autograd_meta_vec; + // Forward API Call + VLOG(3) << "Final State Running: " + << "add_n_final_state_dygraph_function"; + auto api_result = paddle::experimental::add_n(x); + // Check NaN and Inf if needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("add_n", api_result); + } + + // Get Outputs + auto& out = api_result; + + // Get Output AutoGradMeta + egr::AutogradMeta* out_autograd_meta = egr::EagerUtils::autograd_meta(&out); + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = + egr::EagerUtils::ComputeRequireGrad(trace_backward, x_autograd_meta); + + // Check Inplace if needed + + // Node Creation + if (require_any_grad) { + paddle::platform::RecordEvent node_creation_record_event( + "add_n node_creation", + paddle::platform::TracerEventType::OperatorInner, + 1); + + egr::EagerUtils::PassStopGradient(false, out_autograd_meta); + + // Node Construction + auto grad_node = + std::shared_ptr(new AddNGradNodeFinal(1, 1)); + // 
SetAttributes if needed + + // Set TensorWrappers for Forward Inputs if needed + grad_node->SetTensorWrapperx(x); + // SetGradOutMeta & SetEdges + grad_node->SetGradOutMeta(x, 0); + // SetOutRank & SetHistory & SetGradInMeta & RetainGrad + if (out_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(out_autograd_meta, 0); + } + if (out_autograd_meta) { + egr::EagerUtils::SetHistory(out_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(out, 0); + egr::EagerUtils::CheckAndRetainGrad(out); + // Set TensorWrappers for Forward Outputs if needed + } + + // Returns + return out; +} diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt index 21642fbd6495c..fa6a9a53abae3 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt @@ -1,8 +1,13 @@ +cc_library( + add_n_node + SRCS add_n_node.cc + DEPS ${eager_deps} ${fluid_deps}) + cc_library( conv2d_nodes SRCS conv2d_nodes.cc DEPS ${eager_deps} ${fluid_deps}) set(eager_manual_nodes - conv2d_nodes + conv2d_nodes add_n_node PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc new file mode 100644 index 0000000000000..e314c0c2b5b4e --- /dev/null +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/nan_inf_utils.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/api_custom_impl.h" +DECLARE_bool(check_nan_inf); + +paddle::small_vector, + egr::kSlotSmallVectorSize> +AddNGradNodeFinal::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + // Fill Zero For GradIn Tensors + + // Apply Gradient Hooks + auto hooked_grads = ApplyGradientHooks(grads); + + // Collect GradIn Tensors, Attrs and Recovered TensorWrappers + auto x = egr::EagerUtils::RecoverTensorWrapper(&this->x_); + auto& out_grad = hooked_grads[0][0]; + // Prepare Grad function call + + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + returns(1); + for (int i = 0; i < 1; ++i) { + out_metas[i].size() == 0 ? 
returns[i].resize(1) + : returns[i].resize(out_metas[i].size()); + } + + std::vector api_output_0; + api_output_0.reserve(returns[0].size()); + for (size_t i = 0; i < returns[0].size(); ++i) { + if (out_metas[0].empty() || out_metas[0][i].IsStopGradient()) { + api_output_0.push_back(nullptr); + } else { + api_output_0.push_back(&returns[0][i]); + } + } + // Call grad_api function + VLOG(3) << "Final State Running: AddNGradNodeFinal"; + + // dygraph function + for (size_t i = 0; i < returns[0].size(); i++) { + returns[0][i] = ::scale_final_state_dygraph_function( + out_grad, phi::Scalar(1.0), 0.0, true); + } + + // Check NaN and Inf id needed + if (FLAGS_check_nan_inf) { + egr::CheckTensorHasNanOrInf("add_n_grad", returns); + } + + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns); + return returns; +} diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h index f202b64f0b709..14fe144c0094a 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/imperative/tracer.h" class Conv2dGradNodeFinal : public egr::GradNodeBase { public: @@ -180,3 +181,49 @@ class Conv2dDoubleGradNodeFinal : public egr::GradNodeBase { int workspace_size_MB_; bool exhaustive_search_; }; + +class AddNGradNodeFinal : public egr::GradNodeBase { + public: + AddNGradNodeFinal() : egr::GradNodeBase() {} + AddNGradNodeFinal(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + ~AddNGradNodeFinal() override = default; + + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()( + paddle::small_vector, // NOLINT + egr::kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) override; + std::string name() override { return "AddNGradNodeFinal"; } + + void ClearTensorWrappers() override { + for (auto& tw : x_) { + tw.clear(); + } + + SetIsTensorWrappersCleared(true); + } + + std::shared_ptr Copy() const override { + auto copied_node = + std::shared_ptr(new AddNGradNodeFinal(*this)); + return copied_node; + } + + // SetTensorWrapperX, SetTensorWrapperY, ... + void SetTensorWrapperx(const std::vector& x) { + for (const auto& eager_tensor : x) { + x_.emplace_back(egr::TensorWrapper(eager_tensor, true)); + } + } + + // SetAttributes + + private: + // TensorWrappers + std::vector x_; + + // Attributes +}; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index a6f5a36e389a9..a3beb268cfafb 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -41,7 +41,7 @@ # and this will be fixed in the futrue. 
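A small numeric sketch of the rule AddNGradNodeFinal implements above: for out = add_n({x_0, ..., x_{n-1}}), every input slot receives exactly the upstream gradient, which the node materialises by scaling out_grad with factor 1.0 once per input. Plain std::vector stands in for Tensor here; this is illustration only, not the eager API.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<std::vector<float>> xs = {{1, 2, 3}, {4, 5, 6}};
  std::vector<float> out(3, 0.0f);
  for (const auto& x : xs) {                     // forward: elementwise add_n
    for (std::size_t i = 0; i < out.size(); ++i) out[i] += x[i];
  }

  std::vector<float> out_grad = {1, 1, 1};       // upstream gradient
  std::vector<std::vector<float>> x_grads(xs.size(), out_grad);  // dx_i = dout

  for (float v : out) std::cout << v << ' ';     // 5 7 9
  std::cout << "| dx0: ";
  for (float v : x_grads[0]) std::cout << v << ' ';  // 1 1 1
  std::cout << '\n';
  return 0;
}

This is also what the new Python test below checks: with out1 = x + z and out2 = y + z, paddle.grad of add_n([out1, out2]) with respect to x and y returns all-ones tensors.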
inplace_check_blacklist = set(["assign_out_"]) # # --- Black Ops list that's NO NEED to apply backward code generation -black_ops_list = ["conv2d", "conv2d_grad", "conv2d_grad_grad"] +black_ops_list = ["conv2d", "conv2d_grad", "conv2d_grad_grad", "add_n"] ########### @@ -283,6 +283,7 @@ class {} : public egr::GradNodeBase {{ #pragma once #include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/api/manual/eager_manual/nodes/nodes.h" {} """ @@ -316,6 +317,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/eager/to_static/run_program_op_func.h" #include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" + {} {} """ @@ -1648,6 +1650,8 @@ def GenerateCode(self): namespace = self.namespace for forward_api_contents in forward_api_list: + if forward_api_contents['api'] in black_ops_list: continue + backward_api_contents = self.GetBackwardAPIContents( forward_api_contents) if backward_api_contents is None: continue diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index b68418885ca21..362c9606ebadf 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -871,49 +871,6 @@ std::tuple momentum_impl( ////////////////// Backward(grad) api impls ////////////////////// -// TODO(chenweihang): the original sum grad op can support higher-level -// differentiation, -// but if we use this impl, it will not support. We need to be able to reuse -// the autograd API here, which is not yet implemented -// TODO(chenweihang): we should support call generated api in custom api impl -void add_n_grad_impl(const std::vector& x, - const Tensor& out_grad, - std::vector x_grad) { - auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - - Backend kernel_backend = kernel_key.backend(); - DataLayout kernel_layout = kernel_key.layout(); - DataType kernel_data_type = kernel_key.dtype(); - - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "scale", {kernel_backend, kernel_layout, kernel_data_type}); - VLOG(6) << "add_n_grad API kernel key: [" << kernel_backend << ", " - << kernel_layout << ", " << kernel_data_type << "]"; - VLOG(6) << "add_n_grad API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_backend); - - auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(0), {}); - - auto dense_x_grad = SetKernelOutput(&x_grad); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - const phi::Scalar&, - float, - bool, - phi::DenseTensor*); - auto* kernel_fn = kernel.GetVariadicKernelFn(); - - for (auto* dense_x_grad_t : dense_x_grad) { - phi::MetaTensor meta_out(dense_x_grad_t); - phi::UnchangedInferMeta(MakeMetaTensor(*dense_out_grad), &meta_out); - (*kernel_fn)( - *dev_ctx, *dense_out_grad, phi::Scalar(1.0), 0.0, true, dense_x_grad_t); - } -} - std::tuple batch_norm_impl( const Tensor& x, const Tensor& scale, diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 627ff2aabf11c..ef695580a0773 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -116,10 +116,6 @@ std::tuple momentum_impl( ////////////////// Backward(grad) api impls ////////////////////// -void add_n_grad_impl(const std::vector& x, - const Tensor& out_grad, - std::vector x_grad); - void 
conv2d_grad_impl(const Tensor& input, const Tensor& filter, const Tensor& out_grad, diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 4af32c7e4cfa0..f01598e643420 100644 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -71,13 +71,6 @@ backward : add_double_grad inplace : (out_grad -> x_grad) -- backward_api : add_n_grad - forward : add_n (Tensor[] x) -> Tensor(out) - args : (Tensor[] x, Tensor out_grad) - output : Tensor[](x_grad){x.size()} - invoke : add_n_grad_impl(x, out_grad, x_grad) - no_need_buffer : x - - backward_api : add_triple_grad forward : add_double_grad (Tensor y, Tensor grad_out, Tensor grad_grad_x, Tensor grad_grad_y, int axis = -1) -> Tensor(grad_grad_out) args : (Tensor grad_grad_x, Tensor grad_grad_y, Tensor grad_grad_out_grad, int axis = -1) diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 9d1a4cf19eb07..ad226878f7ef1 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -384,6 +384,29 @@ def test_dygraph_final_state_api(self): self.assertEqual( (input1.grad.numpy() == expected_grad_result).all(), True) + def test_add_n_and_add_and_grad(self): + with fluid.dygraph.guard(): + np_x = np.array([[1, 2, 3], [4, 5, 6]]) + np_y = [[7, 8, 9], [10, 11, 12]] + np_z = [[1, 1, 1], [1, 1, 1]] + x = paddle.to_tensor(np_x, dtype='float32', stop_gradient=False) + y = paddle.to_tensor(np_y, dtype='float32', stop_gradient=False) + z = paddle.to_tensor(np_z, dtype='float32') + + out1 = x + z + out2 = y + z + out = paddle.add_n([out1, out2]) + + dx, dy = paddle.grad([out], [x, y], create_graph=True) + + expected_out = np.array([[10., 12., 14.], [16., 18., 20.]]) + expected_dx = np.array([[1, 1, 1], [1, 1, 1]]) + expected_dy = np.array([[1, 1, 1], [1, 1, 1]]) + + self.assertTrue(np.allclose(out, expected_out)) + self.assertTrue(np.allclose(dx, expected_dx)) + self.assertTrue(np.allclose(dy, expected_dy)) + class TestRaiseSumError(unittest.TestCase): From cbb956b328d2f5be8e46d978c207623f55ea7917 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 11 Jul 2022 10:17:24 +0800 Subject: [PATCH 111/250] Fix overwrite in where_index (#44181) --- paddle/phi/kernels/funcs/select_impl.cu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index a036f27cc2b80..831e0ca907b3c 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -395,7 +395,6 @@ void SelectKernel(const KPDevice &dev_ctx, paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); // 1.1 get stored data num of per block - int total_true_num = 0; // init const int kVecSize = 4; #ifdef PADDLE_WITH_XPU_KP int block = 64; @@ -424,6 +423,7 @@ void SelectKernel(const KPDevice &dev_ctx, DenseTensor cumsum_mem = phi::Empty(dev_ctx, dims_array); CT *cumsum_data = cumsum_mem.data(); // 2.2 get prefix of count_data for real out_index + CT total_true_num = static_cast(0); // init const int kCumVesize = 2; const int block_c = 256; const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c)); @@ -448,7 +448,7 @@ void SelectKernel(const KPDevice &dev_ctx, if (SelectData == 1) { out->Resize(phi::make_ddim(out_dim)); } else if (SelectData == 0) { // == 0 where_index - 
out_dim.push_back(rank); + out_dim.push_back(static_cast(rank)); out->Resize(phi::make_ddim(out_dim)); } auto out_data = out->mutable_data(cuda_place); From 1dc55942cd7db109ae1f80a14bbd3fa29e447f32 Mon Sep 17 00:00:00 2001 From: Leo Guo <58431564+ZibinGuo@users.noreply.github.com> Date: Mon, 11 Jul 2022 10:27:57 +0800 Subject: [PATCH 112/250] Update the latest dependence of KL-SDK in xpu.cmake and modify the code (#44158) style in test_iou_similarity_op_xpu.py. *test=kunlun --- cmake/external/xpu.cmake | 4 ++-- .../fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index d75af71203bf9..c5eecef3abdec 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220703") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220706") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -19,7 +19,7 @@ endif() if(NOT DEFINED XPU_XDNN_BASE_URL) set(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220703") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220706") else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py index 56ad05505a3ac..46e82c68321ee 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py @@ -13,11 +13,6 @@ # limitations under the License. 
from __future__ import print_function -import unittest -import sys - -sys.path.append("..") - import unittest import numpy as np import numpy.random as random From ee5cb5f2fefdbf9b10222f35a5f44f8481d4a1e0 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sun, 10 Jul 2022 22:13:24 -0500 Subject: [PATCH 113/250] fix undefined-variable (#44187) --- python/paddle/hapi/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 31a430789d636..16b3646a4a81a 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -465,7 +465,7 @@ def _run(self, inputs, labels=None): idx] == core.VarDesc.VarType.FP16: if isinstance(feed[n], core.LoDTensor): feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16) - elif isinstance(feed[n], numpy.array): + elif isinstance(feed[n], np.array): feed[n] = feed[n].astype('float16') if labels is not None: From c57e12bec4ce78db3db8165b6e9821d4fddd660c Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sun, 10 Jul 2022 22:58:45 -0500 Subject: [PATCH 114/250] refine dist_grad kernel (#44182) * refine dist_grad kernel * fix cpu kernel bug --- paddle/phi/kernels/cpu/dist_grad_kernel.cc | 22 -- paddle/phi/kernels/dist_grad_kernel.cc | 93 ++++++++ paddle/phi/kernels/gpu/dist_grad_kernel.cu | 26 -- .../phi/kernels/impl/dist_grad_kernel_impl.h | 223 ------------------ 4 files changed, 93 insertions(+), 271 deletions(-) delete mode 100644 paddle/phi/kernels/cpu/dist_grad_kernel.cc create mode 100644 paddle/phi/kernels/dist_grad_kernel.cc delete mode 100644 paddle/phi/kernels/gpu/dist_grad_kernel.cu delete mode 100644 paddle/phi/kernels/impl/dist_grad_kernel_impl.h diff --git a/paddle/phi/kernels/cpu/dist_grad_kernel.cc b/paddle/phi/kernels/cpu/dist_grad_kernel.cc deleted file mode 100644 index c1aaa2adf7563..0000000000000 --- a/paddle/phi/kernels/cpu/dist_grad_kernel.cc +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/dist_grad_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" - -PD_REGISTER_KERNEL( - dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc new file mode 100644 index 0000000000000..ba468ad299e4c --- /dev/null +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/dist_grad_kernel.h" + +#include +#include +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" +#include "paddle/phi/kernels/p_norm_grad_kernel.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" +#include "paddle/phi/kernels/scale_kernel.h" + +namespace phi { + +std::pair, std::vector> GetReduceDims( + const DDim& src_dim, const DDim& dst_dim) { + std::vector reduce_dims, new_dims; + auto pre_dims = src_dim.size() - dst_dim.size(); + for (auto i = 0; i < pre_dims; ++i) { + reduce_dims.push_back(i); + } + + for (auto i = pre_dims; i < src_dim.size(); ++i) { + if (dst_dim[i - pre_dims] == 1 && src_dim[i] != 1) { + reduce_dims.push_back(i); + } else { + new_dims.push_back(dst_dim[i - pre_dims]); + } + } + return {reduce_dims, new_dims}; +} + +template +void DistGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& out_grad, + float p, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto t = Subtract(dev_ctx, x, y); + DenseTensor x_grad_tmp; + x_grad_tmp.Resize(t.dims()); + DenseTensor y_grad_tmp; + y_grad_tmp.Resize(t.dims()); + PNormGradKernel( + dev_ctx, t, out, out_grad, p, -1, 1e-12, false, true, &x_grad_tmp); + ScaleKernel(dev_ctx, x_grad_tmp, -1.0, 0.0, false, &y_grad_tmp); + // do reduce, the implemetation of cpu SumKernel has bug, it changes + // the dims of output iternally, so we Resize x/y_grad twice. + auto res_x = GetReduceDims(x_grad_tmp.dims(), x.dims()); + if (!std::get<0>(res_x).empty()) { + x_grad->Resize(phi::make_ddim(std::get<1>(res_x))); + SumKernel( + dev_ctx, x_grad_tmp, std::get<0>(res_x), x.dtype(), false, x_grad); + x_grad->Resize(x.dims()); + } else { + x_grad->ShareBufferWith(x_grad_tmp); + } + auto res_y = GetReduceDims(y_grad_tmp.dims(), y.dims()); + if (!std::get<0>(res_y).empty()) { + y_grad->Resize(phi::make_ddim(std::get<1>(res_y))); + SumKernel( + dev_ctx, y_grad_tmp, std::get<0>(res_y), y.dtype(), false, y_grad); + y_grad->Resize(y.dims()); + } else { + y_grad->ShareBufferWith(y_grad_tmp); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} +#endif diff --git a/paddle/phi/kernels/gpu/dist_grad_kernel.cu b/paddle/phi/kernels/gpu/dist_grad_kernel.cu deleted file mode 100644 index df422e8b2daf9..0000000000000 --- a/paddle/phi/kernels/gpu/dist_grad_kernel.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
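For context, the DistGradKernel above replaces the rank-templated Eigen implementation with a composition of existing kernels: it forms t = x - y with Subtract, reuses PNormGradKernel on t for x's gradient, negates that result with ScaleKernel for y's gradient, and finally applies SumKernel over the dimensions returned by GetReduceDims to fold each gradient back to its input's shape when broadcasting occurred. A small dygraph sketch of the behaviour this refactor is expected to preserve (shapes are illustrative only):

import paddle

x = paddle.rand([2, 3, 4])
x.stop_gradient = False
y = paddle.rand([3, 1])           # broadcasts against x
y.stop_gradient = False

out = paddle.dist(x, y, p=2)      # ||x - y||_2 over the broadcasted shape
out.backward()

# After the reduce step, each gradient is folded back to its input's shape.
assert x.grad.shape == [2, 3, 4]
assert y.grad.shape == [3, 1]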
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/kernels/dist_grad_kernel.h" - -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/dist_grad_kernel_impl.h" - -#ifdef PADDLE_WITH_HIP -PD_REGISTER_KERNEL(dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float) {} -#else -PD_REGISTER_KERNEL( - dist_grad, GPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#endif diff --git a/paddle/phi/kernels/impl/dist_grad_kernel_impl.h b/paddle/phi/kernels/impl/dist_grad_kernel_impl.h deleted file mode 100644 index fc118a832dc9f..0000000000000 --- a/paddle/phi/kernels/impl/dist_grad_kernel_impl.h +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -template -using ETensor = phi::EigenTensor; - -template -static void GetBraodcastDims(const phi::DDim& x_dims, - const phi::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, - 0, - phi::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, - y_dims)); -} - -static phi::DDim GetNewDims(const phi::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistGradFunction(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& out_grad, - float p, - DenseTensor* x_grad, - DenseTensor* y_grad) { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - auto out_dims = out.dims(); - - phi::DDim x_new_dims = GetNewDims(x_dims, Rank); - phi::DDim y_new_dims = GetNewDims(y_dims, Rank); - phi::DDim out_new_dims = GetNewDims(out_dims, Rank); - auto x_t = 
ETensor::From(x, x_new_dims); - auto y_t = ETensor::From(y, y_new_dims); - auto out_t = ETensor::From(out, out_new_dims); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes out_bcast_dims; - - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - std::vector new_dims_vec(Rank); - for (int i = 0; i < Rank; ++i) { - new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]); - out_bcast_dims[i] = new_dims_vec[i]; - } - phi::DDim new_dims = phi::make_ddim(new_dims_vec); - - auto& place = *dev_ctx.eigen_device(); - auto out_grad_t = ETensor::From(out_grad, out_new_dims); - DenseTensor grad; - grad.Resize(new_dims); - dev_ctx.template Alloc(&grad); - auto grad_t = ETensor::From(grad); - - auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims); - auto x_minux_y_abs = x_minux_y.abs(); - auto sign = - (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + - (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); - T epsilon = static_cast(1.0e-10f); - - // 1: Lp-norm(z), z = x-y, compute dz - if (p == 0) { - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, &grad, static_cast(0)); - } else if (p == INFINITY || p == -INFINITY) { - // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if - // j!=i, or equals to sign(z_i) * dout if j=i. - if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } else { - // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } - - Eigen::DSizes x_reshape_dims; - Eigen::DSizes y_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < x_new_dims.size(); ++i) { - x_reshape_dims[2 * i] = x_bcast_dims[i]; - x_reshape_dims[2 * i + 1] = x_new_dims[i]; - y_reshape_dims[2 * i] = y_bcast_dims[i]; - y_reshape_dims[2 * i + 1] = y_new_dims[i]; - reduce_dims[i] = 2 * i; - } - - // 2: if x or y is broadcasted in forward function, - // the grad need to be sum along the broadcasted dimensions - if (x_grad) { - dev_ctx.template Alloc(x_grad); - auto x_grad_t = ETensor::From(*x_grad, x_new_dims); - x_grad_t.device(place) = grad_t.reshape(x_reshape_dims) - .sum(reduce_dims) - .reshape(x_grad_t.dimensions()); - } - if (y_grad) { - dev_ctx.template Alloc(y_grad); - auto y_grad_t = ETensor::From(*y_grad, y_new_dims); - y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims) - .sum(reduce_dims) - .reshape(y_grad_t.dimensions()); - } -} - -template -void DistGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& out_grad, - float p, - DenseTensor* x_grad, - DenseTensor* y_grad) { - auto x_rank = x.dims().size(); - auto y_rank = y.dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, - 6, - phi::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's 
rank is %d, Y's rank is %d.", - x_rank, - y_rank)); - switch (rank) { - case 1: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 2: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 3: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 4: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 5: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - case 6: - DistGradFunction( - dev_ctx, x, y, out, out_grad, p, x_grad, y_grad); - break; - } -} - -} // namespace phi From 13a250a22004ce4a3a28b3a7247f8cd866fb1428 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Mon, 11 Jul 2022 14:47:54 +0800 Subject: [PATCH 115/250] [AutoParallel] add 'to_static' in engine api (#44202) * add 'to_static' in engine api * fix cmakelist --- .../distributed/auto_parallel/dist_context.py | 3 + .../distributed/auto_parallel/engine.py | 159 +++++++++++++++--- .../auto_parallel/parallelizer_v2.py | 11 +- .../unittests/auto_parallel/CMakeLists.txt | 1 + .../unittests/auto_parallel/test_to_static.py | 122 ++++++++++++++ 5 files changed, 272 insertions(+), 24 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 2f57b0ac0e415..04b7f6aded7a5 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -125,6 +125,9 @@ def __init__(self, # A flag indicates whether the used parallelism is data parallel self._data_parallel = False + # flag whether using `to_static` + self._dygraph_mode = True + @property def serial_main_program(self): return self._serial_main_program diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 3d5b91cd7faa7..e65a51a09a16f 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -21,6 +21,7 @@ from paddle import fluid, static from paddle.io import Dataset +from paddle.jit import to_static from paddle.metric import Metric from paddle.static import InputSpec from paddle.fluid import core @@ -28,7 +29,7 @@ from paddle.fluid.layers.utils import flatten from paddle.fluid.executor import global_scope, _to_name_str from paddle.fluid.backward import append_backward -from paddle.fluid.framework import Operator +from paddle.fluid.framework import Operator, Parameter, _non_static_mode from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed import fleet @@ -82,6 +83,7 @@ def __init__(self, self._feed_vars = {} self._fetch_vars = {} self._planners = {} + self._dygraph_mode = False def prepare(self, optimizer=None, @@ -131,27 +133,110 @@ def prepare(self, def _build(self, mode): - serial_main_prog = self._serial_main_progs.get(mode, None) - if serial_main_prog is not None: - return - - losses = [] - metrics = [] - serial_main_prog = self._orig_main_prog.clone() - serial_startup_prog = self._orig_startup_prog.clone() - with static.program_guard(serial_main_prog, serial_startup_prog), \ - utils.unique_name.guard(): - inputs_spec = self.inputs_spec - labels_spec = self.labels_spec if self.labels_spec else [] - inputs = 
[s._create_feed_layer() for s in inputs_spec] - labels = [s._create_feed_layer() for s in labels_spec] - outputs = to_list(self.model(*inputs)) - if mode != "predict" and self._loss: - losses = to_list(self._loss(*(outputs + labels))) - - if mode != "predict": - for metric in self._metrics: - metrics.extend(to_list(metric.compute(*(outputs + labels)))) + if _non_static_mode() or self._dygraph_mode: + self._dygraph_mode = True + self._logger.info("Building model with 'to_static' method.") + + # build forward main program + self.static_model = to_static(self.model, + input_spec=self.inputs_spec) + inputs = self.static_model.forward.inputs + outputs = self.static_model.forward.outputs + forward_main_prog = self.static_model.forward.main_program + forward_startup_prog = self.static_model.forward.concrete_program.startup_program + self.concrete_program = self.static_model.forward.concrete_program + + # build loss main program + outputs_spec = [] + outputs_name = [] + for out in outputs: + outputs_spec.append(InputSpec(out.shape, out.dtype, out.name)) + outputs_name.append(out.name) + if isinstance(self._loss, paddle.nn.Layer): + self.static_loss = to_static(self._loss.forward, + input_spec=outputs_spec + + self.labels_spec) + loss_main_prog = self.static_loss.main_program + elif callable(self._loss): + self.static_loss = to_static(self._loss, + input_spec=outputs_spec + + self.labels_spec) + loss_main_prog = self.static_loss.main_program + + # build startup program + for param in self.concrete_program.parameters: + Parameter(name=param.name, + desc=param, + type=param.type, + shape=param.shape, + dtype=param.dtype, + stop_gradient=param.stop_gradient, + block=forward_startup_prog.global_block()) + + paddle.enable_static() + + # NOTE: pure program will loss dist_attr + # feeded_var_names = [var.name for var in inputs] + # main_prog_0 = main_prog_0._prune_with_input( + # feeded_var_names=feeded_var_names, targets=outputs) + + labels = [] + losses = [] + metrics = [] + # concat forward and loss prog + if mode != 'predict' and self._loss: + forward_block = forward_main_prog.global_block() + loss_block = loss_main_prog.global_block() + for idx, op in enumerate(loss_block.ops): + op_desc = forward_block.desc.append_op() + op_desc.copy_from(op.desc) + for in_name in op.input_arg_names: + if in_name in outputs_name: + continue + in_var = forward_block._clone_variable( + loss_block.vars[in_name], force_persistable=False) + if loss_block.vars[in_name].is_data: + labels.append(in_var) + for out_name in op.output_arg_names: + out_var = forward_block._clone_variable( + loss_block.vars[out_name], force_persistable=False) + if idx == len(loss_block.ops) - 1: + losses.append(out_var) + forward_block._sync_with_cpp() + serial_main_prog = forward_main_prog + serial_startup_prog = forward_startup_prog + # update metrics op in program + with static.program_guard(serial_main_prog, serial_startup_prog), \ + utils.unique_name.guard(): + if mode != "predict": + for metric in self._metrics: + metrics.extend( + to_list(metric.compute(*(outputs + labels)))) + + else: + # build program in static mode + serial_main_prog = self._serial_main_progs.get(mode, None) + if serial_main_prog is not None: + return + + losses = [] + metrics = [] + serial_main_prog = self._orig_main_prog.clone() + serial_startup_prog = self._orig_startup_prog.clone() + with static.program_guard(serial_main_prog, serial_startup_prog), \ + utils.unique_name.guard(): + inputs_spec = self.inputs_spec + labels_spec = self.labels_spec if self.labels_spec 
else [] + inputs = [s._create_feed_layer() for s in inputs_spec] + labels = [s._create_feed_layer() for s in labels_spec] + outputs = to_list(self.model(*inputs)) + if mode != "predict" and self._loss: + losses = to_list(self._loss(*(outputs + labels))) + + if mode != "predict": + for metric in self._metrics: + metrics.extend( + to_list(metric.compute(*(outputs + labels)))) default_ctx = get_default_distributed_context() if not default_ctx.has_annotation: @@ -172,6 +257,7 @@ def _build(self, mode): serial_main_prog, serial_startup_prog, self._optimizer, losses, feed_vars, fetch_vars, self.cluster, self.strategy) self._dist_contexts[mode].gradient_scale = self._gradient_scale + self._dist_contexts[mode]._dygraph_mode = self._dygraph_mode def _plan(self, mode): if self._planned_mode is None: @@ -236,6 +322,35 @@ def _initialize(self, mode): self._place = _get_device() if isinstance(self._place, fluid.CUDAPlace): self._place = fluid.CUDAPlace(ParallelEnv().dev_id) + + if self._dygraph_mode: + paddle.disable_static() + main_program = self._dist_main_progs[mode][self._cur_rank] + for param in self.concrete_program.parameters: + # create var in scope and share parameters to scope + if param.name not in main_program.global_block().vars: + continue + # get param_var's dist_attr + var = main_program.global_block().vars[param.name] + var_dist_attr = self._dist_contexts[ + mode].get_tensor_dist_attr_for_program(var) + dist_attr = { + "dims_mapping": var_dist_attr.dims_mapping, + "process_shape": var_dist_attr.process_mesh.topology, + "process_group": var_dist_attr.process_mesh.processes + } + # slice param_value with dist_attr + # share sliced_param_value with param_tensor in global_scope + from .converter import Converter + param_tensor = global_scope().var(param.name).get_tensor() + sliced_param = Converter.slice_with_dist_attr( + param.numpy(), dist_attr) + shared_tensor = paddle.to_tensor(sliced_param, + place=self._place) + param_tensor._share_data_with( + shared_tensor.value().get_tensor()) + paddle.enable_static() + if self._executor is None: self._executor = paddle.static.Executor(self._place) uninitialized = [] diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index dce3908e75a62..d8c0da9e27056 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -15,8 +15,10 @@ import copy from collections import defaultdict +import paddle from paddle.fluid import program_guard from paddle.fluid.backward import append_backward +from paddle.fluid.framework import _non_static_mode from paddle.distributed.passes import new_pass from .reshard import Resharder @@ -110,9 +112,14 @@ def _generate_backward(self, main_program, startup_program, loss): def _generate_optimizer(self, main_program, startup_program, optimizer, params_grads): + if self._dist_context._dygraph_mode: + paddle.disable_static() + optimizer = copy.deepcopy(optimizer) + paddle.enable_static() + else: + optimizer = copy.deepcopy(optimizer) with program_guard(main_program, startup_program): - optimizer_ops = copy.deepcopy(optimizer).apply_gradients( - params_grads) + optimizer_ops = optimizer.apply_gradients(params_grads) self._completer.complete_update_annotation(main_program) return optimizer_ops diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index e0eb04e2535c5..5738412dd52ae 100644 
--- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -53,4 +53,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS}) py_test_modules(test_dist_context MODULES test_dist_context ENVS ${dist_ENVS}) py_test_modules(test_prim_dist_op MODULES test_prim_dist_op ENVS ${dist_ENVS}) + py_test_modules(test_to_static MODULES test_to_static ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py new file mode 100644 index 0000000000000..4e4fb9b5825ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py @@ -0,0 +1,122 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import os +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed.auto_parallel as auto +import paddle.distributed.fleet as fleet + +from paddle.io import Dataset +from paddle.static import InputSpec +from paddle.fluid.framework import _non_static_mode +from paddle.distributed.auto_parallel.engine import Engine + +batch_size = 4 +batch_num = 30 +hidden_size = 1024 +class_num = 10 + + +class MyDataset(Dataset): + + def __init__(self, num_samples): + super(MyDataset, self).__init__() + self.num_samples = num_samples + + def __getitem__(self, index): + input = np.random.uniform(size=hidden_size).astype("float32") + label = np.random.randint(0, class_num - 1, dtype="int64") + return input, label + + def __len__(self): + return self.num_samples + + +class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=initializer_range)) + + self.linear0 = nn.Linear(d_model, + dim_feedforward, + weight_attr, + bias_attr=None) + self.linear1 = nn.Linear(dim_feedforward, + d_model, + weight_attr, + bias_attr=None) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=None) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + out = self.dropout(out) + out = self.linear2(out) + + return out + + +class TestToStatic(unittest.TestCase): + + def test_to_static(self): + + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.SGD(learning_rate=0.00001, + parameters=mlp.parameters()) + + dataset = 
MyDataset(batch_num * batch_size) + + inputs = InputSpec([batch_size, hidden_size], 'float32', 'x') + labels = InputSpec([batch_size], 'int64', 'label') + + engine = Engine(model=mlp, + inputs_spec=inputs, + labels_spec=labels, + strategy=None) + assert _non_static_mode() == True + + engine.prepare(optimizer=optimizer, + loss=loss, + metrics=paddle.metric.Accuracy()) + + assert _non_static_mode() == False + engine.fit(dataset, batch_size=batch_size) + engine.evaluate(dataset, batch_size=batch_size) + engine.predict(dataset, batch_size=batch_size) + + +if __name__ == "__main__": + unittest.main() From 9a3054c6244e26dba91dce728d0e18a6d58a6ad2 Mon Sep 17 00:00:00 2001 From: heliqi <1101791222@qq.com> Date: Mon, 11 Jul 2022 02:11:37 -0500 Subject: [PATCH 116/250] [Inference]ort backend optimizer (#44136) * add ort clone interface * paddle2onnx update to 1.0.0rc * ort input_tensor use mutable data of scope --- cmake/external/paddle2onnx.cmake | 2 +- .../inference/api/details/zero_copy_tensor.cc | 120 ------------------ .../inference/api/onnxruntime_predictor.cc | 68 ++++++---- .../inference/api/onnxruntime_predictor.h | 20 ++- 4 files changed, 61 insertions(+), 149 deletions(-) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index cbb622f5cb952..b8a1b4548b822 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -24,7 +24,7 @@ endif() include(ExternalProject) set(PADDLE2ONNX_PROJECT "extern_paddle2onnx") -set(PADDLE2ONNX_VERSION "0.9.9") +set(PADDLE2ONNX_VERSION "1.0.0rc") set(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) set(PADDLE2ONNX_SOURCE_DIR ${THIRD_PARTY_PATH}/paddle2onnx/src/${PADDLE2ONNX_PROJECT}) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 4040d09c4519e..7bb384b27381d 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -179,13 +179,6 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { -#ifdef PADDLE_WITH_ONNXRUNTIME - if (is_ort_tensor_) { - ORTCopyFromCpu(data); - return; - } -#endif - EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, @@ -731,112 +724,6 @@ void Tensor::SetOrtBuffer(const std::shared_ptr> buffer) { buffer_ = buffer; } -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - float *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - int64_t *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - int32_t *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - uint8_t *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value GetOrtVaule(const Ort::MemoryInfo &memory_info, - int8_t *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor( - memory_info, data, size, shape, shape_len); -} - -Ort::Value 
GetOrtVaule(const Ort::MemoryInfo &memory_info, - float16 *data, - size_t size, - const int64_t *shape, - size_t shape_len) { - return Ort::Value::CreateTensor(memory_info, - static_cast(data), - size * sizeof(float16), - shape, - shape_len, - ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16); -} - -template -void Tensor::ORTCopyFromCpu(const T *data) { - auto binding = binding_.lock(); - PADDLE_ENFORCE_NOT_NULL(binding, - paddle::platform::errors::PreconditionNotMet( - "input tensor [%s] no binding ptr", name_)); - const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; - Ort::MemoryInfo memory_info( - device_name, OrtDeviceAllocator, device_, OrtMemTypeDefault); - size_t size = std::accumulate( - begin(shape_), end(shape_), 1UL, std::multiplies()); - auto buffer = buffer_.lock(); - size_t buffer_size = size * sizeof(T); - if (buffer_size > buffer->size()) { - buffer->resize(buffer_size); - } - std::memcpy(static_cast(buffer->data()), data, buffer_size); - - auto onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; - if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; - } else if (std::is_same::value) { - onnx_dtype = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Found undefined data type for onnxruntime, only supports " - "float16/float32/float64/int8/uint8/int32/int64.")); - } - - auto ort_value = Ort::Value::CreateTensor(memory_info, - buffer->data(), - buffer_size, - shape_.data(), - shape_.size(), - onnx_dtype); - binding->BindInput(name_.c_str(), ort_value); -} - template void Tensor::ORTCopyToCpu(T *data) const { auto binding = binding_.lock(); @@ -857,13 +744,6 @@ void Tensor::ORTCopyToCpu(T *data) const { } } -template void Tensor::ORTCopyFromCpu(const float *data); -template void Tensor::ORTCopyFromCpu(const int64_t *data); -template void Tensor::ORTCopyFromCpu(const int32_t *data); -template void Tensor::ORTCopyFromCpu(const uint8_t *data); -template void Tensor::ORTCopyFromCpu(const int8_t *data); -template void Tensor::ORTCopyFromCpu(const float16 *data); - template void Tensor::ORTCopyToCpu(float *data) const; template void Tensor::ORTCopyToCpu(int32_t *data) const; template void Tensor::ORTCopyToCpu(uint8_t *data) const; diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index 83919ad13967d..5313db6442986 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -24,11 +24,10 @@ #include #include -#include "paddle/fluid//platform/device/gpu/gpu_types.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include 
"paddle/fluid/inference/api/paddle_inference_pass.h" @@ -97,6 +96,7 @@ bool ONNXRuntimePredictor::Init() { } else { place_ = paddle::platform::CPUPlace(); } + scope_.reset(new paddle::framework::Scope()); char *onnx_proto = nullptr; int out_size; @@ -147,6 +147,8 @@ bool ONNXRuntimePredictor::Init() { Ort::Allocator allocator(session_, memory_info); size_t n_inputs = session_.GetInputCount(); + framework::proto::VarType::Type proto_type = + framework::proto::VarType::LOD_TENSOR; for (size_t i = 0; i < n_inputs; ++i) { auto input_name = session_.GetInputName(i, allocator); auto type_info = session_.GetInputTypeInfo(i); @@ -155,6 +157,10 @@ bool ONNXRuntimePredictor::Init() { ONNXTensorElementDataType data_type = type_info.GetTensorTypeAndShapeInfo().GetElementType(); input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); + + auto *ptr = scope_->Var(input_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(input_name); } @@ -249,13 +255,13 @@ bool ONNXRuntimePredictor::FindONNXDesc(const std::string &name, std::unique_ptr ONNXRuntimePredictor::GetInputTensor( const std::string &name) { - PADDLE_ENFORCE_EQ(FindONNXDesc(name, true), - true, - platform::errors::PreconditionNotMet( - "The in variable named %s is not found in the " - "ONNXPredictor.", - name)); - std::unique_ptr res(new ZeroCopyTensor(nullptr, this)); + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "ONNXPredictor.", + name)); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(scope_.get()), this)); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -264,16 +270,6 @@ std::unique_ptr ONNXRuntimePredictor::GetInputTensor( auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); } - res->SetOrtMark(true); - res->SetOrtBinding(binding_); - auto iter = input_buffers_.find(name); - if (iter == input_buffers_.end()) { - std::vector i_vector; - input_buffers_[name] = std::make_shared>(i_vector); - res->SetOrtBuffer(input_buffers_[name]); - } else { - res->SetOrtBuffer(iter->second); - } return res; } @@ -306,6 +302,24 @@ std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( return res; } +Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, + const char *device_name) { + Ort::MemoryInfo memory_info( + device_name, OrtDeviceAllocator, place_.GetDeviceId(), OrtMemTypeDefault); + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); + std::vector shape = phi::vectorize(tensor->dims()); + return Ort::Value::CreateTensor(memory_info, + static_cast(tensor->data()), + size, + shape.data(), + shape.size(), + desc.dtype); +} + bool ONNXRuntimePredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { @@ -315,7 +329,13 @@ bool ONNXRuntimePredictor::Run(const std::vector &inputs, bool ONNXRuntimePredictor::ZeroCopyRun() { try { - const char *device_name = place_ == PlaceType::kCPU ? "Cpu" : "Cuda"; + const char *device_name = platform::is_cpu_place(place_) ? 
"Cpu" : "Cuda"; + std::vector inputs; + inputs.reserve(input_desc_.size()); + for (auto desc : input_desc_) { + inputs.push_back(GetOrtValue(desc, device_name)); + binding_->BindInput(desc.name.c_str(), inputs.back()); + } for (auto output : output_desc_) { Ort::MemoryInfo out_memory_info(device_name, OrtDeviceAllocator, @@ -333,8 +353,10 @@ bool ONNXRuntimePredictor::ZeroCopyRun() { } std::unique_ptr ONNXRuntimePredictor::Clone(void *stream) { - LOG(ERROR) << "Not support Clone(), Please create new Predictor"; - return nullptr; + std::lock_guard lk(clone_mutex_); + auto *x = new ONNXRuntimePredictor(config_); + x->Init(); + return std::unique_ptr(x); } uint64_t ONNXRuntimePredictor::TryShrinkMemory() { diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index 27ce4529a8fe8..b8f0ad0a52941 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -21,8 +21,6 @@ #include "onnxruntime_c_api.h" // NOLINT #include "onnxruntime_cxx_api.h" // NOLINT -#include "paddle/fluid/framework/naive_executor.h" -#include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" @@ -94,7 +92,7 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// \param[in] AnalysisConfig config /// explicit ONNXRuntimePredictor(const AnalysisConfig &config) - : config_(config), env_(ORT_LOGGING_LEVEL_WARNING, "onnx") { + : env_(ORT_LOGGING_LEVEL_WARNING, "onnx"), config_(config) { predictor_id_ = inference::GetUniqueId(); } /// @@ -176,6 +174,8 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// std::unique_ptr Clone(void *stream = nullptr) override; + std::shared_ptr scope_; + protected: const void *GetDeviceContexts() const override; @@ -191,14 +191,24 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// bool FindONNXDesc(const std::string &name, bool is_input); - private: - AnalysisConfig config_; + /// \brief get the Ort Value(input Tensor). + /// + /// \param[in] desc ONNXDesce(name、shape、dtype) + /// + /// \param[in] device_name "cpu" or "gpu" of device + /// + /// \return get a Ort::Value + /// + Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); + private: // ONNXRuntime Ort::Env env_; Ort::Session session_{nullptr}; std::shared_ptr binding_; + AnalysisConfig config_; + std::mutex clone_mutex_; platform::Place place_; std::vector input_desc_; std::vector output_desc_; From 3ca713ee52a70aa0fb056d693061f9065e74c8da Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Mon, 11 Jul 2022 15:25:45 +0800 Subject: [PATCH 117/250] rmsprop for xpu. test=kunlun (#44175) * rmsprop for xpu. test=kunlun * minor fix (follow comments). 
test=kunlun --- cmake/external/xpu.cmake | 4 +- .../operators/optimizers/rmsprop_op_xpu.cc | 286 +++++------ .../fluid/platform/device/xpu/xpu2_op_list.h | 1 + .../white_list/no_check_set_white_list.py | 1 + .../unittests/xpu/test_rmsprop_op_xpu.py | 464 +++++++----------- 5 files changed, 313 insertions(+), 443 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index c5eecef3abdec..3228f5a556c2e 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220706") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220707") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -19,7 +19,7 @@ endif() if(NOT DEFINED XPU_XDNN_BASE_URL) set(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220706") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220707") else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index eb987151472e2..6addb7c2febd8 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -1,141 +1,145 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_XPU - -#include - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/device_wrapper.h" - -namespace paddle { -namespace operators { - -static inline float GetAttrFromTensor(const framework::Tensor* tensor) { - const float* tensor_data = tensor->data(); - framework::Tensor cpu_tensor; - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place())) { - paddle::framework::TensorCopySync( - *tensor, platform::CPUPlace(), &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - return tensor_data[0]; -} - -using framework::OpKernelType; -using framework::Tensor; - -template -class RmspropOpXPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using paddle::framework::LoDTensor; - - // check Param & Grad tensor type - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), - true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type,Expected Var(%s)'s " - "type is LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), - true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type,Expected Var(%s)'s " - "type is LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - // inputs - auto& param = GET_DATA_SAFELY( - ctx.Input("Param"), "Input", "Param", "Rmsprop"); - auto& meanSquare = GET_DATA_SAFELY( - ctx.Input("MeanSquare"), "Input", "MeanSquare", "Rmsprop"); - auto& grad = GET_DATA_SAFELY( - ctx.Input("Grad"), "Input", "Grad", "Rmsprop"); - auto& mom = GET_DATA_SAFELY( - ctx.Input("Moment"), "Input", "Moment", "Rmsprop"); - - auto* learning_rate = ctx.Input("LearningRate"); - PADDLE_ENFORCE_EQ(learning_rate->dims().size(), - 1, - platform::errors::InvalidArgument( - "learining rate should have dimension = 1." 
- " But received learning rate dim [%s] ", - learning_rate->dims().size())); - T lr = static_cast(GetAttrFromTensor(learning_rate)); - - // constants - T epsilon = static_cast(ctx.Attr("epsilon")); - T decay = static_cast(ctx.Attr("decay")); - T momentum = static_cast(ctx.Attr("momentum")); - - // outputs - auto& param_out = GET_DATA_SAFELY( - ctx.Output("ParamOut"), "Output", "ParamOut", "Rmsprop"); - auto& mom_out = GET_DATA_SAFELY( - ctx.Output("MomentOut"), "Output", "MomentOut", "Rmsprop"); - auto& mom_sqrt_out = GET_DATA_SAFELY(ctx.Output("MeanSquareOut"), - "Output", - "MeanSquareOut", - "Rmsprop"); - auto& dev_ctx = ctx.template device_context(); - - ///// rmsprop优化算法 - /// - /// ms_out[i] = rho * ms[i] + (1 - rho) * (g[i] * g[i]); - /// - /// mom_out[i] = momentum * mom[i] + lr * - /// (g[i] / ((float)sqrt(ms_out[i] + epsilon))); - /// - /// p_out[i] = p[i] - mom_out[i]; - /// DLL_EXPORT int rmsprop(Context* ctx, const float* p, - /// const float* ms, const float* g, const float* mom, - /// float epsilon, float rho, float momentum, float lr, - /// float *ms_out, float *mom_out, float *p_out, int n) - int r = xpu::rmsprop(dev_ctx.x_context(), - grad.template data(), - param.template data(), - meanSquare.template data(), - mom.template data(), - param_out.template mutable_data(ctx.GetPlace()), - mom_sqrt_out.template mutable_data(ctx.GetPlace()), - mom_out.template mutable_data(ctx.GetPlace()), - epsilon, - decay, - momentum, - lr, - param.numel()); - - PADDLE_ENFORCE_XDNN_SUCCESS(r, "rmsprop"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - rmsprop, - ops::RmspropOpXPUKernel); -#endif +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include + +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +static inline float GetAttrFromTensor(const framework::Tensor* tensor) { + const float* tensor_data = tensor->data(); + framework::Tensor cpu_tensor; + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place())) { + paddle::framework::TensorCopySync( + *tensor, platform::CPUPlace(), &cpu_tensor); + tensor_data = cpu_tensor.data(); + } + return tensor_data[0]; +} + +using framework::OpKernelType; +using framework::Tensor; + +template +class RmspropOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using paddle::framework::LoDTensor; + + // check Param & Grad tensor type + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), + true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), + true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); + + // inputs + auto& param = GET_DATA_SAFELY( + ctx.Input("Param"), "Input", "Param", "Rmsprop"); + auto& meanSquare = GET_DATA_SAFELY( + ctx.Input("MeanSquare"), "Input", "MeanSquare", "Rmsprop"); + auto& grad = GET_DATA_SAFELY( + ctx.Input("Grad"), "Input", "Grad", "Rmsprop"); + auto& mom = GET_DATA_SAFELY( + ctx.Input("Moment"), "Input", "Moment", "Rmsprop"); + + auto* learning_rate = ctx.Input("LearningRate"); + PADDLE_ENFORCE_EQ(learning_rate->dims().size(), + 1, + platform::errors::InvalidArgument( + "learining rate should have dimension = 1." + " But received learning rate dim [%s] ", + learning_rate->dims().size())); + T lr = static_cast(GetAttrFromTensor(learning_rate)); + + // constants + T epsilon = static_cast(ctx.Attr("epsilon")); + T decay = static_cast(ctx.Attr("decay")); + T momentum = static_cast(ctx.Attr("momentum")); + + bool centered = ctx.Attr("centered"); + PADDLE_ENFORCE_EQ(centered, + false, + platform::errors::Unimplemented( + "centered=True is not supported in the xpu kernel of " + "rmsprop. use XPU_BLACK_LIST to disable this op.")); + /* + TODO(houj04): when XDNN api supports 'center', add input of + mean_grad_input and output of mean_grad_output. 
auto *mean_grad_input = + ctx.Input("MeanGrad"); auto *mean_grad_output = + ctx.Output("MeanGradOut"); + */ + + // outputs + auto& param_out = GET_DATA_SAFELY( + ctx.Output("ParamOut"), "Output", "ParamOut", "Rmsprop"); + auto& mom_out = GET_DATA_SAFELY( + ctx.Output("MomentOut"), "Output", "MomentOut", "Rmsprop"); + auto& mom_sqrt_out = GET_DATA_SAFELY(ctx.Output("MeanSquareOut"), + "Output", + "MeanSquareOut", + "Rmsprop"); + auto& dev_ctx = ctx.template device_context(); + + // int rmsprop(Context* ctx, const T* g, const T* p, const float* ms, const + // float* mom, T* p_out, float* ms_out, float* mom_out, float epsilon, float + // rho, float momentum, float lr, int n); + int r = xpu::rmsprop(dev_ctx.x_context(), + grad.template data(), + param.template data(), + meanSquare.template data(), + mom.template data(), + param_out.template mutable_data(ctx.GetPlace()), + mom_sqrt_out.template mutable_data(ctx.GetPlace()), + mom_out.template mutable_data(ctx.GetPlace()), + epsilon, + decay, + momentum, + lr, + param.numel()); + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "rmsprop"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + rmsprop, + ops::RmspropOpXPUKernel); +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 2fa287b80f451..c5a70b03cd3c8 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -363,6 +363,7 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py index ea3264ba0dbb7..fb0cb2d7a5aee 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py @@ -36,4 +36,5 @@ 'eigvalsh', 'class_center_sample', 'einsum', + 'rmsprop', ] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py index 9f7ca522d742b..020dbf344b68a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py @@ -1,300 +1,164 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
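For reference, the non-centered update that the XPU kernel above delegates to xpu::rmsprop should match the definition used by the CPU/GPU kernels; centered=True is rejected for now, so only the plain update applies. A NumPy sketch of a single step (names and default hyper-parameters are illustrative, not the kernel's actual signature):

import numpy as np

def rmsprop_step(param, mean_square, moment, grad,
                 lr, rho=0.9, momentum=0.1, epsilon=1e-6):
    # accumulate the squared gradient (rho corresponds to the 'decay' attribute)
    mean_square_out = rho * mean_square + (1.0 - rho) * grad * grad
    # momentum buffer built from the rescaled gradient
    moment_out = momentum * moment + lr * grad / np.sqrt(mean_square_out + epsilon)
    # apply the update
    param_out = param - moment_out
    return param_out, mean_square_out, moment_out

param = np.zeros(3)
mean_square = np.zeros(3)
moment = np.zeros(3)
grad = np.array([0.1, -0.2, 0.3])
param, mean_square, moment = rmsprop_step(param, mean_square, moment, grad, lr=0.01)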
- -from __future__ import print_function -import sys - -sys.path.append("..") - -import unittest -import numpy as np -import paddle.fluid.core as core -from paddle.fluid.op import Operator -from op_test_xpu import XPUOpTest -import paddle.fluid as fluid -import paddle -''' -def create_selected_rows_and_tensor(scope, place, height, row_num, - embedding_size): - sr = scope.var("@selected_rows@").get_selected_rows() - tensor = scope.var("grad").get_tensor() - - rows = np.random.random_integers( - low=0, high=height - 1, size=[row_num, ]).astype('int64') - sr_val = np.random.random(size=[row_num, embedding_size]).astype('float32') - - sr.set_height(height) - sr.set_rows(rows) - sr.get_tensor().set(sr_val, place) - - tensor_val = np.zeros(shape=[height, embedding_size], dtype='float32') - for i in range(row_num): - row = rows[i] - tensor_val[row, :] = tensor_val[row, :] + sr_val[i, :] - - tensor.set(tensor_val, place) - return tensor_val, sr_val -''' -""" -class TestBase(XPUOpTest): - op_type = 'rmsprop' - - def setup(self, - place, - is_sparse, - centered, - size, - row_num=None, - epsilon=1e-6): - - np.random.seed(5) # fix seed - - self.scope = fluid.global_scope() - self.place = place - - self.param_name = 'param' - self.param = np.random.random(size).astype('float32') - - self.mean_square_name = 'mean_square' - self.mean_square = np.random.uniform( - low=1, high=2, size=size).astype('float32') - - self.mean_grad_name = 'mean_grad' - self.mean_grad = np.random.random(size).astype('float32') - - self.lr_name = 'lr' - self.learning_rate = np.array([0.01]).astype('float32') - - self.grad_name = 'grad' - self.is_sparse = is_sparse - - self.grad = np.random.random(size).astype('float32') - grad_tensor = self.scope.var(self.grad_name).get_tensor() - grad_tensor.set(self.grad, place) - - self.moment_name = 'moment' - self.moment = np.random.uniform( - low=0, high=1, size=size).astype('float32') - - self.epsilon = epsilon - self.decay = 0.9 - self.momentum = 0.1 - self.centered = centered - - self.ms_out = self.decay * self.mean_square + (1 - self.decay - ) * self.grad * self.grad - if centered: - self.mg_out = self.decay * self.mean_grad + (1 - self.decay - ) * self.grad - self.moment_out = self.momentum * self.moment + \ - self.learning_rate * self.grad / np.sqrt(self.ms_out - np.square(self.mg_out) + self.epsilon) - else: - self.moment_out = self.momentum * self.moment + \ - self.learning_rate * self.grad / np.sqrt(self.ms_out + self.epsilon) - - self.param_out = self.param - self.moment_out - - # create and initialize Param Variable - self.param_tensor = self.scope.var(self.param_name).get_tensor() - self.param_tensor.set(self.param, place) - - self.mean_square_tensor = self.scope.var( - self.mean_square_name).get_tensor() - self.mean_square_tensor.set(self.mean_square, place) - - lr = self.scope.var(self.lr_name).get_tensor() - lr.set(self.learning_rate, place) - - self.moment_tensor = self.scope.var(self.moment_name).get_tensor() - self.moment_tensor.set(self.moment, place) - - if self.centered: - self.mean_grad_tensor = self.scope.var( - self.mean_grad_name).get_tensor() - self.mean_grad_tensor.set(self.mean_grad, place) - - def check(self, actual_t, expect_t, place, out_name, atol=1e-5): - self.assertTrue( - np.allclose( - actual_t, expect_t, atol=atol), - 'Output (' + out_name + ') has diff at ' + str(place) + '\nExpect ' - + str(expect_t) + '\n' + 'But Got' + str(actual_t)) - - -class TestRmspropOp(TestBase): - def check_with_place(self, - place, - is_sparse, - centered, - size, - 
row_num=None, - epsilon=1e-6): - self.setup(place, is_sparse, centered, size, row_num, epsilon) - self.run_and_check() - - def run_and_check(self): - #grad_name = self.grad_sr_name if self.is_sparse else self.grad_name - grad_name = self.grad_name - - kwargs = { - 'Param': self.param_name, - 'Grad': grad_name, - 'MeanSquare': self.mean_square_name, - 'Moment': self.moment_name, - 'LearningRate': self.lr_name, - 'ParamOut': self.param_name, - 'MeanSquareOut': self.mean_square_name, - 'MomentOut': self.moment_name, - 'epsilon': self.epsilon, - 'decay': self.decay, - 'momentum': self.momentum, - 'centered': self.centered - } - - if self.centered: - kwargs['MeanGrad'] = self.mean_grad_name - kwargs['MeanGradOut'] = self.mean_grad_name - - rmsprop_op = Operator('rmsprop', **kwargs) - atol = 1e-6 - - rmsprop_op.run(self.scope, self.place) - - self.check( - np.array(self.mean_square_tensor), - self.ms_out, - self.place, - self.mean_square_name, - atol=atol) - self.check( - np.array(self.moment_tensor), - self.moment_out, - self.place, - self.moment_name, - atol=atol) - self.check( - np.array(self.param_tensor), - self.param_out, - self.place, - self.param_name, - atol=atol) - - if self.centered: - self.check( - np.array(self.mean_grad_tensor), self.mg_out, self.place, - self.mean_grad_name) - - def test_rmsprop(self): - places = [paddle.XPUPlace(0)] - - size = (128, 320) - for place in places: - for centered in [False]: - with fluid.scope_guard(core.Scope()): - self.check_with_place( - place, is_sparse=False, centered=centered, size=size) - - with fluid.scope_guard(core.Scope()): - self.check_with_place( - place, - is_sparse=True, - centered=centered, - row_num=512, - size=size) - - with fluid.scope_guard(core.Scope()): - self.check_with_place( - place, - is_sparse=True, - centered=centered, - row_num=60, - size=size, ) - - -class TestRMSPropV2(XPUOpTest): - op_type = 'rmsprop' - - def test_rmsprop_dygraph(self): - paddle.disable_static() - value = np.arange(26).reshape(2, 13).astype('float32') - a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5) - # This can be any optimizer supported by dygraph. 
- adam = paddle.optimizer.RMSProp( - learning_rate=0.01, - parameters=linear.parameters(), - weight_decay=0.01) - out = linear(a) - out.backward() - adam.step() - adam.clear_gradients() - - def test_rmsprop(self): - place = paddle.XPUPlace(0) - paddle.enable_static() - main = fluid.Program() - with fluid.program_guard(main): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - - print(avg_cost.shape) - linear = paddle.nn.Linear(13, 5) - rms_optimizer = paddle.optimizer.RMSProp( - learning_rate=0.1, parameters=linear.parameters()) - rms_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - def test_raise_error(self): - self.assertRaises(ValueError, paddle.optimizer.RMSProp, None) - self.assertRaises( - ValueError, paddle.optimizer.RMSProp, learning_rate=0.1, rho=None) - self.assertRaises( - ValueError, - paddle.optimizer.RMSProp, - learning_rate=0.1, - epsilon=None) - self.assertRaises( - ValueError, - paddle.optimizer.RMSProp, - learning_rate=0.1, - momentum=None) - - def test_rmsprop_op_invalid_input(self): - paddle.disable_static() - linear = paddle.nn.Linear(10, 10) - with self.assertRaises(ValueError): - adam = paddle.optimizer.RMSProp( - 0.1, epsilon=-1, parameters=linear.parameters()) - with self.assertRaises(ValueError): - adam = paddle.optimizer.RMSProp( - 0.1, momentum=-1, parameters=linear.parameters()) - with self.assertRaises(ValueError): - adam = paddle.optimizer.RMSProp( - 0.1, rho=-1, parameters=linear.parameters()) -""" - -if __name__ == "__main__": - paddle.enable_static() - unittest.main() +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle +import paddle.fluid.core as core + +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +def calculate_rmsprop_by_numpy(param, grad, mean_square, moment, learning_rate, + epsilon, decay, momentum): + mean_square_out = decay * mean_square + (1 - decay) * grad * grad + moment_out = momentum * moment + learning_rate * grad / np.sqrt( + mean_square_out + epsilon) + param_out = param - moment_out + return param_out, mean_square_out, moment_out + + +class XPUTestRMSPropOP(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'rmsprop' + self.use_dynamic_create_class = False + + class TestRMSPropOPBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.xpu_version = core.get_xpu_device_version(0) + self.init_dtype() + self.set_case() + + def set_case(self): + self.op_type = 'rmsprop' + self.dtype = self.in_type + self.init_config() + + self.param = np.random.uniform(-1, 1, + self.input_shape).astype(self.dtype) + self.grad = np.random.uniform(-1, 1, + self.input_shape).astype(self.dtype) + self.mean_square = np.random.uniform(0, 1, self.input_shape).astype( + self.dtype) + self.moment = np.random.uniform(-1, 1, + self.input_shape).astype(self.dtype) + + self.mean_grad = np.random.uniform(-1, 1, self.input_shape).astype( + self.dtype) + self.mean_grad_out = np.random.uniform( + -1, 1, self.input_shape).astype(self.dtype) + + param_out, mean_square_out, moment_out = calculate_rmsprop_by_numpy( + param=self.param, + grad=self.grad, + mean_square=self.mean_square, + moment=self.moment, + learning_rate=self.learning_rate, + epsilon=self.epsilon, + decay=self.decay, + momentum=self.momentum) + self.inputs = { + 'Param': self.param, + 'Grad': self.grad, + 'MeanSquare': self.mean_square, + 'Moment': self.moment, + 'LearningRate': self.learning_rate, + 'MeanGrad': self.mean_grad, + 'MeanGradOut': self.mean_grad_out, + } + self.attrs = { + 'use_xpu': True, + 'epsilon': self.epsilon, + 'decay': self.decay, + 'momentum': self.momentum, + 'centered': + False, # TODO(houj04): when XDNN api supports 'center = True', add more test cases + } + self.outputs = { + 'ParamOut': param_out, + 'MomentOut': moment_out, + 'MeanSquareOut': mean_square_out, + 'MeanGradOut': self.mean_grad_out + } + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, + no_check_set=['MeanGradOut']) + + def init_config(self): + self.input_shape = [864] + self.learning_rate = np.array([0.001]).astype(self.dtype) + self.epsilon = 1e-4 + self.decay = 0.9 + self.momentum = 0.1 + + class XPUTestRMSProp1(TestRMSPropOPBase): + + def init_config(self): + self.input_shape = [2, 768] + self.learning_rate = np.array([0.002]).astype(self.dtype) + self.epsilon = 1e-4 + self.decay = 0.9 + self.momentum = 0.1 + + class XPUTestRMSProp2(TestRMSPropOPBase): + + def init_config(self): + self.input_shape = [3, 8, 4096] + self.learning_rate = np.array([0.005]).astype(self.dtype) + self.epsilon = 1e-6 + self.decay = 0.95 + self.momentum = 0 + + class XPUTestRMSProp3(TestRMSPropOPBase): + + def init_config(self): + self.input_shape = [1024] + self.learning_rate = np.array([0.01]).astype(self.dtype) + self.epsilon = 1e-5 + self.decay = 0.99 + self.momentum = 0.02 + + class 
XPUTestRMSProp4(TestRMSPropOPBase): + + def init_config(self): + self.input_shape = [2, 2, 255] + self.learning_rate = np.array([0.0005]).astype(self.dtype) + self.epsilon = 1e-3 + self.decay = 0.8 + self.momentum = 0.002 + + +support_types = get_xpu_op_support_types('rmsprop') +for stype in support_types: + create_test_class(globals(), XPUTestRMSPropOP, stype) + +if __name__ == "__main__": + unittest.main() From 526be01a395412707834c90ee5d2d5397c99bad4 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Mon, 11 Jul 2022 16:05:58 +0800 Subject: [PATCH 118/250] [CustomDevice]support fast_eager_deletion_mode (#44211) --- paddle/fluid/imperative/tracer.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 4c99bfc248e88..07eb9ae6a8e5e 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -161,8 +161,14 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( #endif } else if (platform::is_custom_place(place)) { #if defined(PADDLE_WITH_CUSTOM_DEVICE) - gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0)); - VLOG(10) << "Created GarbageCollector at " << place; + if (framework::IsFastEagerDeletionModeEnabled()) { + gc.reset( + new framework::CustomDeviceUnsafeFastGarbageCollector(place, 0)); + VLOG(10) << "Created UnsafeFastGarbageCollector at " << place; + } else { + gc.reset(new framework::CustomDefaultStreamGarbageCollector(place, 0)); + VLOG(10) << "Created GarbageCollector at " << place; + } #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use CustomDevice since it's not compiled with " From 826e278192c1aa18ab7b8a826d1e8ed325cca478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Mon, 11 Jul 2022 10:19:15 +0200 Subject: [PATCH 119/250] Unify and generalize activation fuse passes (#44185) * reduce redundancy * python code style * fix int8 ut --- .../framework/ir/graph_pattern_detector.cc | 109 +++-------------- .../framework/ir/graph_pattern_detector.h | 84 +------------ .../conv_activation_mkldnn_fuse_pass.cc | 61 +++------- .../mkldnn/conv_activation_mkldnn_fuse_pass.h | 8 +- .../ir/mkldnn/elt_act_mkldnn_fuse_pass.cc | 79 ++++-------- .../ir/mkldnn/elt_act_mkldnn_fuse_pass.h | 8 +- .../ir/mkldnn/fc_act_mkldnn_fuse_pass.cc | 15 +-- .../softplus_activation_mkldnn_fuse_pass.cc | 71 ++++------- .../softplus_activation_mkldnn_fuse_pass.h | 6 +- .../mkldnn/elementwise_mkldnn_op.h | 17 +-- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 18 +-- .../operators/mkldnn/softplus_mkldnn_op.h | 36 +----- paddle/fluid/platform/mkldnn_reuse.h | 113 ++++++++++++++---- ...st_mkldnn_softplus_activation_fuse_pass.py | 28 ++--- 14 files changed, 203 insertions(+), 450 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index f0949cb9dfbd2..e811475dd83e9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -931,65 +931,22 @@ PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input, return bn_out_var; } -PDNode *patterns::ConvActivation::operator()( - paddle::framework::ir::PDNode *conv_input, - std::string conv_type, - std::string activation_type) { - // Create Operators - conv_input->assert_is_op_input(conv_type, "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(conv_type); - auto *activation_op = - 
pattern->NewNode(activation_repr())->assert_is_op(activation_type); - // Create variables - // Filter - auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input(conv_type, "Filter"); - // intermediate variable, will be removed in the IR after fuse. - auto *conv_out_var = pattern->NewNode(conv_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op(conv_type) - ->assert_is_op_input(activation_type); - // output - auto *activation_out_var = pattern->NewNode(activation_out_repr()) - ->AsOutput() - ->assert_is_op_output(activation_type); - - conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); - activation_op->LinksFrom({conv_out_var}).LinksTo({activation_out_var}); - return activation_out_var; -} - -PDNode *patterns::ElementwiseActivation::operator()( - paddle::framework::ir::PDNode *elementwise_a, - const std::string &elementwise_type, - const std::string &activation_type) { - // Create Operators - elementwise_a->assert_is_op_input(elementwise_type, "X"); - auto *elementwise_op = - pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type); +PDNode *patterns::OperatorActivation::operator()( + const std::string &operator_type, const std::string &activation_type) { + auto *preceding_op = + pattern->NewNode(preceding_op_repr())->assert_is_op(operator_type); + auto *preceding_op_out = pattern->NewNode(preceding_op_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(operator_type) + ->assert_is_op_input(activation_type); auto *activation_op = pattern->NewNode(activation_repr())->assert_is_op(activation_type); - // Create variables - auto *elementwise_b = pattern->NewNode(elementwise_b_repr()) - ->AsInput() - ->assert_is_op_input(elementwise_type, "Y"); - // intermediate variable, will be removed in the IR after fuse. 
- auto *elementwise_out_var = - pattern->NewNode(elementwise_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op(elementwise_type) - ->assert_is_op_input(activation_type); - // output - auto *activation_out_var = pattern->NewNode(activation_out_repr()) - ->AsOutput() - ->assert_is_op_output(activation_type); - - elementwise_op->LinksFrom({elementwise_a, elementwise_b}) - .LinksTo({elementwise_out_var}); - activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var}); - return activation_out_var; + auto *activation_out = pattern->NewNode(activation_out_repr()) + ->AsOutput() + ->assert_is_op_output(activation_type); + preceding_op->LinksTo({preceding_op_out}); + activation_op->LinksFrom({preceding_op_out}).LinksTo({activation_out}); + return activation_out; } PDNode *patterns::SeqConvEltAddRelu::operator()( @@ -1121,44 +1078,6 @@ PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x, return fc_out_var; } -PDNode *patterns::FCActOneDNN::operator()(const std::string &act_type) { - auto *fc = pattern->NewNode(fc_repr())->assert_is_op("fc"); - auto *fc_out = pattern->NewNode(fc_out_repr()) - ->assert_is_op_output("fc", "Out") - ->assert_is_op_input(act_type); - auto *act = - pattern->NewNode(act_repr())->assert_is_op(act_type)->AsIntermediate(); - auto *act_out = pattern->NewNode(act_out_repr()) - ->assert_is_op_output(act_type, "Out") - ->AsOutput(); - - fc->LinksTo({fc_out}); - act->LinksFrom({fc_out}).LinksTo({act_out}); - - return act_out; -} - -PDNode *patterns::SoftplusActivation::operator()(std::string activation_type) { - // Create Operators - auto *softplus_op = - pattern->NewNode(softplus_repr())->assert_is_op("softplus"); - auto *activation_op = - pattern->NewNode(activation_repr())->assert_is_op(activation_type); - // intermediate variable, will be removed in the IR after fuse. 
- auto *softplus_out = pattern->NewNode(softplus_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op("softplus") - ->assert_is_op_input(activation_type); - // output - auto *activation_out = pattern->NewNode(activation_out_repr()) - ->AsOutput() - ->assert_is_op_output(activation_type); - - softplus_op->LinksTo({softplus_out}); - activation_op->LinksFrom({softplus_out}).LinksTo({activation_out}); - return activation_out; -} - PDNode *patterns::Embedding::operator()(PDNode *x) { x->assert_is_op_input("lookup_table", "Ids"); auto *lookup_table_op = diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index be14ef2dbf3ea..9210cecabe7c6 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -524,49 +524,16 @@ struct ConvBN : public PatternBase { PATTERN_DECL_NODE(bn_saved_variance); }; -// Conv with Activation -// op: conv + activation -// named nodes: -// conv_input, conv_weight, -// conv_out, conv, -// activation_out, activation -struct ConvActivation : public PatternBase { - ConvActivation(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "conv_activation") {} - - PDNode* operator()(PDNode* conv_input, - std::string conv_type = "conv2d", - std::string activation_type = "relu"); +struct OperatorActivation : public PatternBase { + OperatorActivation(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "operator_activation") {} - // declare operator node's name - PATTERN_DECL_NODE(conv); - PATTERN_DECL_NODE(activation); - // declare variable node's name - PATTERN_DECL_NODE(conv_weight); - PATTERN_DECL_NODE(conv_out); - PATTERN_DECL_NODE(activation_out); -}; - -// Elementwise with Activation -// op: elementwise + activation -// named nodes: -// elementwise_a, elementwise_b, -// elementwise_out, elementwise, -// activation_out, activation -struct ElementwiseActivation : public PatternBase { - ElementwiseActivation(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "elementwise_add_activation") {} - - PDNode* operator()(PDNode* elementwise_a, - const std::string& elementwise_type, + PDNode* operator()(const std::string& operator_type, const std::string& activation_type); - // declare operator node's name - PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(preceding_op); + PATTERN_DECL_NODE(preceding_op_out); PATTERN_DECL_NODE(activation); - // declare variable node's name - PATTERN_DECL_NODE(elementwise_b); - PATTERN_DECL_NODE(elementwise_out); PATTERN_DECL_NODE(activation_out); }; @@ -639,45 +606,6 @@ struct FCMKLDNN : public PatternBase { PATTERN_DECL_NODE(output); }; -// -// \brief Pattern looking for fc and a directly following activation -// operator. -// -// \note Currently only gelu and tanh are supported as an activation -// function. 
-// Formula: act(fc(x)) -// Op: fc + act -struct FCActOneDNN : public PatternBase { - FCActOneDNN(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "fc_act_onednn") {} - - PDNode* operator()(const std::string& act_type); - - // declare operator node's name - PATTERN_DECL_NODE(fc); - PATTERN_DECL_NODE(act); - PATTERN_DECL_NODE(fc_out); - PATTERN_DECL_NODE(act_out); -}; - -// Fuse softplus with activation -// ops: softplus + activation -// nodes: -// softplus, softplus_out, -// activation, activation_out -struct SoftplusActivation : public PatternBase { - SoftplusActivation(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "softplus_activation") {} - - PDNode* operator()(std::string activation_type); - - // declare operator node's name - PATTERN_DECL_NODE(softplus); - PATTERN_DECL_NODE(activation); - PATTERN_DECL_NODE(softplus_out); - PATTERN_DECL_NODE(activation_out); -}; - // Embedding struct Embedding : public PatternBase { Embedding(PDPattern* pattern, const std::string& name_scope) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 4eefc2987bcb4..8c140e8132489 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -24,61 +25,27 @@ namespace ir { using string::PrettyLogDetail; void ConvActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { - std::vector act_types = {"relu", - "mish", - "swish", - "sqrt", - "hard_swish", - "sigmoid", - "abs", - "gelu", - "relu6", - "clip", - "tanh", - "hard_sigmoid", - "leaky_relu"}; + auto act_types = paddle::platform::GetSupportedActivations(); std::vector conv_types = {"conv2d"}; for (const auto& conv_type : conv_types) for (auto& act_type : act_types) { - std::unordered_map attrs_map; - - if (act_type == "swish") - attrs_map.emplace("beta", "fuse_alpha"); - else if (act_type == "relu6") - attrs_map.emplace("threshold", "fuse_alpha"); - else if (act_type == "hard_sigmoid") { - attrs_map.emplace("slope", "fuse_alpha"); - attrs_map.emplace("offset", "fuse_beta"); - } else if (act_type == "clip") { - attrs_map.emplace("min", "fuse_alpha"); - attrs_map.emplace("max", "fuse_beta"); - } else { - attrs_map.emplace("alpha", "fuse_alpha"); - attrs_map.emplace("beta", "fuse_beta"); - } - FuseConvAct(graph, conv_type, act_type, attrs_map); + FuseConvAct(graph, conv_type, act_type); } } -void ConvActivationMkldnnFusePass::FuseConvAct( - Graph* graph, - const std::string& conv_type, - std::string& act_type, - const std::unordered_map& attrs_map) const { +void ConvActivationMkldnnFusePass::FuseConvAct(Graph* graph, + const std::string& conv_type, + std::string& act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(conv_type + "_" + act_type + "_mkldnn_fuse_pass", graph); GraphPatternDetector gpd; - auto* conv_input = gpd.mutable_pattern() - ->NewNode("conv_activation_mkldnn_fuse/conv_input") - ->AsInput() - ->assert_is_op_input(conv_type, "Input"); - patterns::ConvActivation conv_act_pattern(gpd.mutable_pattern(), - 
"conv_activation_mkldnn_fuse"); - conv_act_pattern(conv_input, conv_type, act_type); + patterns::OperatorActivation conv_act_pattern(gpd.mutable_pattern(), + "conv_activation_mkldnn_fuse"); + conv_act_pattern(conv_type, act_type); int found_conv_activation_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -90,16 +57,16 @@ void ConvActivationMkldnnFusePass::FuseConvAct( return; } - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, conv_act_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_act_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_act_pattern); - GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, conv_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv, preceding_op, conv_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_out, preceding_op_out, conv_act_pattern); GET_IR_NODE_FROM_SUBGRAPH(activation, activation, conv_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, conv_act_pattern); OpDesc* conv_op = conv->Op(); OpDesc* act_op = activation->Op(); - for (const auto& attrs : attrs_map) { + auto attr_map = paddle::platform::GetAttributeMap(act_type); + for (const auto& attrs : attr_map) { if (act_op->HasAttr(attrs.first)) { conv_op->SetAttr(attrs.second, act_op->GetAttr(attrs.first)); } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h index e1e2898384609..11925e1992df4 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h @@ -31,11 +31,9 @@ class ConvActivationMkldnnFusePass : public FusePassBase { protected: void ApplyImpl(Graph *graph) const override; - void FuseConvAct( - Graph *graph, - const std::string &conv_type, - std::string &act_type, - const std::unordered_map &attrs_map) const; + void FuseConvAct(Graph *graph, + const std::string &conv_type, + std::string &act_type) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc index a96ce5e297a87..c9eee31606cc3 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -26,71 +27,40 @@ namespace ir { using string::PrettyLogDetail; void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const { - std::vector act_types = {"relu", - "tanh", - "leaky_relu", - "swish", - "hard_swish", - "sqrt", - "abs", - "clip", - "gelu", - "relu6", - "sigmoid"}; + auto act_types = paddle::platform::GetSupportedActivations(); std::vector elt_types = { "elementwise_add", "elementwise_sub", "elementwise_mul"}; for (const auto &elt_type : elt_types) for (const auto &act_type : act_types) { - std::unordered_map attr_map; - - if (act_type == "swish") - attr_map.emplace("beta", "activation_alpha"); - else if (act_type == "relu6") - attr_map.emplace("threshold", "activation_alpha"); - else if (act_type == "clip") { - attr_map.emplace("min", "activation_alpha"); - attr_map.emplace("max", "activation_beta"); - } else { - attr_map.emplace("alpha", "activation_alpha"); - attr_map.emplace("beta", 
"activation_beta"); - } - FuseElementwiseAct(graph, elt_type, act_type, attr_map); + FuseElementwiseAct(graph, elt_type, act_type); } } void ElementwiseActivationOneDNNPass::FuseElementwiseAct( Graph *graph, const std::string &elt_type, - const std::string &act_type, - const std::unordered_map &attr_map) const { + const std::string &act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init(elt_type + "_" + act_type + "_mkldnn_fuse_pass", graph); GraphPatternDetector gpd; - auto *elementwise_input = gpd.mutable_pattern() - ->NewNode(elt_type + "_act/elementwise_input") - ->AsInput() - ->assert_is_op_input(elt_type, "X"); - patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(), - elt_type + "_act"); - elementwise_act_pattern(elementwise_input, elt_type, act_type); + patterns::OperatorActivation elementwise_act_pattern(gpd.mutable_pattern(), + elt_type + "_act"); + elementwise_act_pattern(elt_type, act_type); int found_elementwise_activation_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { VLOG(4) << "Fuse " << elt_type << " with activation op."; - // Elementwise output - GET_IR_NODE_FROM_SUBGRAPH( - elementwise_out, elementwise_out, elementwise_act_pattern); - // ACT output GET_IR_NODE_FROM_SUBGRAPH( - activation_out, activation_out, elementwise_act_pattern); - // ops + elementwise, preceding_op, elementwise_act_pattern); GET_IR_NODE_FROM_SUBGRAPH( - elementwise, elementwise, elementwise_act_pattern); + elementwise_out, preceding_op_out, elementwise_act_pattern); GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + activation_out, activation_out, elementwise_act_pattern); auto *elementwise_op = elementwise->Op(); @@ -106,6 +76,7 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( } auto *activation_op = activation->Op(); + auto attr_map = paddle::platform::GetAttributeMap(act_type); for (const auto &attr : attr_map) { if (activation_op->HasAttr(attr.first)) { elementwise_op->SetAttr(attr.second, @@ -115,9 +86,9 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( if (act_type == "gelu" && activation_op->HasAttr("approximate") && BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) - elementwise_op->SetAttr("activation_type", std::string("gelu_tanh")); + elementwise_op->SetAttr("fuse_activation", std::string("gelu_tanh")); else - elementwise_op->SetAttr("activation_type", act_type); + elementwise_op->SetAttr("fuse_activation", act_type); elementwise_op->SetOutput("Out", {activation_out->Name()}); @@ -146,14 +117,16 @@ REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass) .LE("elementwise_add", 1) .LE("elementwise_sub", 1) .LE("elementwise_mul", 1) - .LE("relu", 0) - .LE("tanh", 0) - .LE("leaky_relu", 1) - .LE("swish", 0) - .LE("hard_swish", 0) - .LE("sqrt", 0) - .LE("abs", 0) + .EQ("abs", 0) .LE("clip", 1) - .LE("gelu", 0) - .LE("relu6", 0) - .LE("sigmoid", 0)); + .EQ("gelu", 0) + .EQ("hard_sigmoid", 0) + .LE("hard_swish", 0) + .LE("leaky_relu", 1) + .LE("mish", 1) + .EQ("relu", 0) + .EQ("relu6", 0) + .EQ("sigmoid", 0) + .EQ("sqrt", 0) + .EQ("swish", 0) + .EQ("tanh", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h index 8df479e3ddf06..37bd5345ec78f 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h @@ 
-34,11 +34,9 @@ class ElementwiseActivationOneDNNPass : public FusePassBase { protected: void ApplyImpl(Graph *graph) const override; - void FuseElementwiseAct( - Graph *graph, - const std::string &elt_types, - const std::string &act_types, - const std::unordered_map &attr_map) const; + void FuseElementwiseAct(Graph *graph, + const std::string &elt_types, + const std::string &act_types) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc index 99243ec7d7047..e5031c83aac16 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc @@ -39,20 +39,17 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, FusePassBase::Init("fc_act", graph); GraphPatternDetector gpd; - patterns::FCActOneDNN fc_act_pattern(gpd.mutable_pattern(), "fc_act"); - fc_act_pattern(act_type); + patterns::OperatorActivation fc_act_pattern(gpd.mutable_pattern(), "fc_act"); + fc_act_pattern("fc", act_type); int found_fc_act_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { VLOG(4) << "Fuse fc with activation op."; - // FC output - GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, fc_act_pattern); - // ACT output - GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, fc_act_pattern); - // ops - GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_act_pattern); - GET_IR_NODE_FROM_SUBGRAPH(act, act, fc_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc, preceding_op, fc_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, preceding_op_out, fc_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, activation, fc_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, activation_out, fc_act_pattern); auto *fc_op = fc->Op(); auto *act_op = act->Op(); diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc index 3dd850d886c8e..41e70e529bf73 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -26,59 +27,34 @@ namespace ir { using string::PrettyLogDetail; void SoftplusActivationOneDNNPass::ApplyImpl(Graph *graph) const { - std::vector act_types = {"relu", - "tanh", - "leaky_relu", - "swish", - "hardswish", - "sqrt", - "abs", - "clip", - "gelu", - "relu6", - "sigmoid"}; + auto act_types = paddle::platform::GetSupportedActivations(); for (const auto &act_type : act_types) { - std::unordered_map attr_map; - - if (act_type == "swish") - attr_map.emplace("beta", "fuse_activation_alpha"); - else if (act_type == "relu6") - attr_map.emplace("threshold", "fuse_activation_alpha"); - else if (act_type == "clip") { - attr_map.emplace("min", "fuse_activation_alpha"); - attr_map.emplace("max", "fuse_activation_beta"); - } else { - attr_map.emplace("alpha", "fuse_activation_alpha"); - attr_map.emplace("beta", "fuse_activation_beta"); - } - FuseSoftplusActivation(graph, act_type, attr_map); + FuseSoftplusActivation(graph, act_type); } } void SoftplusActivationOneDNNPass::FuseSoftplusActivation( - Graph *graph, - const std::string &fuse_activation_type, - const 
std::unordered_map &attr_map) const { + Graph *graph, const std::string &act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); FusePassBase::Init("softplus_activation", graph); GraphPatternDetector gpd; - patterns::SoftplusActivation softplus_activation_pattern( + patterns::OperatorActivation softplus_activation_pattern( gpd.mutable_pattern(), "softplus_activation"); - softplus_activation_pattern(fuse_activation_type); + softplus_activation_pattern("softplus", act_type); int found_softplus_activation_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { VLOG(4) << "Fuse softplus with activation op."; GET_IR_NODE_FROM_SUBGRAPH( - softplus_out, softplus_out, softplus_activation_pattern); + softplus_out, preceding_op_out, softplus_activation_pattern); GET_IR_NODE_FROM_SUBGRAPH( activation_out, activation_out, softplus_activation_pattern); - - GET_IR_NODE_FROM_SUBGRAPH(softplus, softplus, softplus_activation_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + softplus, preceding_op, softplus_activation_pattern); GET_IR_NODE_FROM_SUBGRAPH( activation, activation, softplus_activation_pattern); @@ -94,18 +70,18 @@ void SoftplusActivationOneDNNPass::FuseSoftplusActivation( } auto *activation_op = activation->Op(); + auto attr_map = paddle::platform::GetAttributeMap(act_type); for (const auto &attr : attr_map) { if (activation_op->HasAttr(attr.first)) { softplus_op->SetAttr(attr.second, activation_op->GetAttr(attr.first)); } } - if (fuse_activation_type == "gelu" && - activation_op->HasAttr("approximate") && + if (act_type == "gelu" && activation_op->HasAttr("approximate") && BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) - softplus_op->SetAttr("fuse_activation_type", std::string("gelu_tanh")); + softplus_op->SetAttr("fuse_activation", std::string("gelu_tanh")); else - softplus_op->SetAttr("fuse_activation_type", fuse_activation_type); + softplus_op->SetAttr("fuse_activation", act_type); softplus_op->SetAttr("use_mkldnn", true); @@ -121,7 +97,7 @@ void SoftplusActivationOneDNNPass::FuseSoftplusActivation( if (!Has("disable_logs") || !Get("disable_logs")) PrettyLogDetail("--- fused %d softplus with %s activation", found_softplus_activation_count, - fuse_activation_type); + act_type); } } // namespace ir } // namespace framework @@ -133,13 +109,16 @@ REGISTER_PASS_CAPABILITY(softplus_activation_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("softplus", 1) - .EQ("relu", 0) - .EQ("tanh", 0) - .LE("leaky_relu", 1) - .EQ("swish", 0) - .EQ("hard_swish", 0) - .EQ("sqrt", 0) .EQ("abs", 0) - .LE("relu6", 1) .LE("clip", 1) - .EQ("gelu", 0)); + .EQ("gelu", 0) + .EQ("hard_sigmoid", 0) + .LE("hard_swish", 0) + .LE("leaky_relu", 1) + .LE("mish", 1) + .EQ("relu", 0) + .EQ("relu6", 0) + .EQ("sigmoid", 0) + .EQ("sqrt", 0) + .EQ("swish", 0) + .EQ("tanh", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h index c49502c674355..6368a102b0e85 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass.h @@ -34,10 +34,8 @@ class SoftplusActivationOneDNNPass : public FusePassBase { protected: void ApplyImpl(ir::Graph *graph) const override; - void FuseSoftplusActivation( - ir::Graph *graph, - const std::string &fuse_activation_type, - const 
std::unordered_map &attr_map) const; + void FuseSoftplusActivation(ir::Graph *graph, + const std::string &act_type) const; }; } // namespace ir diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 7f6566460ab62..42d749b7b8e3e 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -50,22 +50,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { private: dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { dnnl::post_ops post_operations; - if (ctx.HasAttr("activation_type")) { - const float scale = ctx.HasAttr("activation_scale") - ? ctx.Attr("activation_scale") - : 1.0f; - const float alpha = ctx.HasAttr("activation_alpha") - ? ctx.Attr("activation_alpha") - : 0.0f; - const float beta = ctx.HasAttr("activation_beta") - ? ctx.Attr("activation_beta") - : 0.0f; - - const auto activation_algorithm = platform::AcquireActivationAlgorithm( - ctx.Attr("activation_type")); - - post_operations.append_eltwise(scale, activation_algorithm, alpha, beta); - } + platform::AppendActivation(ctx, post_operations); return post_operations; } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 17d4c2fad96b8..8ee97c281e3f4 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -553,10 +553,6 @@ class ConvMKLDNNHandlerT dnnl::primitive_attr conv_attr; dnnl::post_ops post_operations; - const std::string fuse_activation = - ctx.Attr("fuse_activation"); - const float fuse_alpha = ctx.Attr("fuse_alpha"); - const float fuse_beta = ctx.Attr("fuse_beta"); const bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); float sum_scale = 1.0f; @@ -587,19 +583,7 @@ class ConvMKLDNNHandlerT post_operations.append_sum(sum_scale); } - if (fuse_activation == "hard_sigmoid") { - post_operations.append_eltwise(activation_scale, - dnnl::algorithm::eltwise_linear, - fuse_alpha, - fuse_beta); - post_operations.append_eltwise( - activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); - } else if (fuse_activation != "") { - const auto activation_algorithm = - platform::AcquireActivationAlgorithm(fuse_activation); - post_operations.append_eltwise( - activation_scale, activation_algorithm, fuse_alpha, fuse_beta); - } + platform::AppendActivation(ctx, post_operations, activation_scale); conv_attr.set_post_ops(post_operations); return conv_attr; diff --git a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h index d2aa1cfc6bbf7..c41864ee26f55 100644 --- a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h +++ b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h @@ -46,7 +46,7 @@ class SoftplusMKLDNNHandler 1.0f, dnnl::algorithm::eltwise_linear, 1.0f / beta, 0.0f); } - AppendFusedActivationIfExists(ctx, &post_ops); + platform::AppendActivation(ctx, post_ops); dnnl::primitive_attr attrs; attrs.set_post_ops(post_ops); @@ -62,42 +62,8 @@ class SoftplusMKLDNNHandler return this->AcquireMemoryFromPrimitive( this->fwd_pd_->src1_desc(), platform::to_void_cast(beta)); } - - private: - void AppendFusedActivationIfExists(const framework::ExecutionContext& ctx, - dnnl::post_ops* post_ops) { - const auto& fused_activation_type = - algo_map.find(ctx.Attr("fuse_activation_type")); - - if (fused_activation_type != algo_map.end()) { - auto 
scale_out = - ctx.Attr("fuse_activation_scale"); // for future int8 support - post_ops->append_eltwise(scale_out, - fused_activation_type->second, - ctx.Attr("fuse_activation_alpha"), - ctx.Attr("fuse_activation_beta")); - } - } - - static const std::unordered_map algo_map; }; -template -const std::unordered_map - SoftplusMKLDNNHandler::algo_map = { - {"relu", dnnl::algorithm::eltwise_relu}, - {"tanh", dnnl::algorithm::eltwise_tanh}, - {"leaky_relu", dnnl::algorithm::eltwise_relu}, - {"swish", dnnl::algorithm::eltwise_swish}, - {"hardswish", dnnl::algorithm::eltwise_hardswish}, - {"sqrt", dnnl::algorithm::eltwise_sqrt}, - {"abs", dnnl::algorithm::eltwise_abs}, - {"clip", dnnl::algorithm::eltwise_clip}, - {"gelu", dnnl::algorithm::eltwise_gelu_erf}, - {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, - {"relu6", dnnl::algorithm::eltwise_bounded_relu}, - {"sigmoid", dnnl::algorithm::eltwise_logistic}}; - template void custom_softplus_eltwise_forward(const framework::ExecutionContext& ctx) { const auto& dev_ctx = diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 2f4bbfaf74fcc..f1963a75b1729 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1013,32 +1013,93 @@ class ActivationMKLDNNHandler } }; -static const dnnl::algorithm AcquireActivationAlgorithm( - std::string activation_name) { - std::unordered_map activation_map = { - {"abs", dnnl::algorithm::eltwise_abs}, - {"clip", dnnl::algorithm::eltwise_clip}, - {"gelu", dnnl::algorithm::eltwise_gelu_erf}, - {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, - {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, - {"hard_swish", dnnl::algorithm::eltwise_hardswish}, - {"leaky_relu", dnnl::algorithm::eltwise_relu}, - {"mish", dnnl::algorithm::eltwise_mish}, - {"relu", dnnl::algorithm::eltwise_relu}, - {"relu6", dnnl::algorithm::eltwise_bounded_relu}, - {"sigmoid", dnnl::algorithm::eltwise_logistic}, - {"sqrt", dnnl::algorithm::eltwise_sqrt}, - {"swish", dnnl::algorithm::eltwise_swish}, - {"tanh", dnnl::algorithm::eltwise_tanh}}; - - const auto& activation_type = activation_map.find(activation_name); - - PADDLE_ENFORCE_NE(activation_type, - activation_map.end(), - platform::errors::InvalidArgument( - "Activation '%s' not found in oneDNN algorithms mapper", - activation_name)); - return activation_type->second; +static void AppendActivation(const framework::ExecutionContext& ctx, + dnnl::post_ops& post_ops, + float activation_scale = 1.0f) { + const auto invalid_attribute = + ctx.HasAttr("fuse_activation") + ? ctx.Attr("fuse_activation").empty() + : true; + if (invalid_attribute) return; + + const auto fuse_activation = ctx.Attr("fuse_activation"); + const auto fuse_alpha = + ctx.HasAttr("fuse_alpha") ? ctx.Attr("fuse_alpha") : 0.0f; + const auto fuse_beta = + ctx.HasAttr("fuse_beta") ? 
ctx.Attr("fuse_beta") : 0.0f; + + if (fuse_activation == "hard_sigmoid") { + post_ops.append_eltwise(activation_scale, + dnnl::algorithm::eltwise_linear, + fuse_alpha, + fuse_beta); + post_ops.append_eltwise( + activation_scale, dnnl::algorithm::eltwise_clip, 0.0f, 1.0f); + } else { + const std::unordered_map activation_map = { + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_erf", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"hard_swish", dnnl::algorithm::eltwise_hardswish}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"mish", dnnl::algorithm::eltwise_mish}, + {"relu", dnnl::algorithm::eltwise_relu}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"tanh", dnnl::algorithm::eltwise_tanh}}; + + const auto& activation_type = activation_map.find(fuse_activation); + + PADDLE_ENFORCE_NE( + activation_type, + activation_map.end(), + platform::errors::InvalidArgument( + "Activation '%s' not found in oneDNN algorithms mapper", + fuse_activation)); + + post_ops.append_eltwise( + activation_scale, activation_type->second, fuse_alpha, fuse_beta); + } +} + +static std::unordered_map GetAttributeMap( + std::string act_type) { + std::unordered_map attr_map; + if (act_type == "swish") + attr_map.emplace("beta", "fuse_alpha"); + else if (act_type == "relu6") + attr_map.emplace("threshold", "fuse_alpha"); + else if (act_type == "hard_sigmoid") { + attr_map.emplace("slope", "fuse_alpha"); + attr_map.emplace("offset", "fuse_beta"); + } else if (act_type == "clip") { + attr_map.emplace("min", "fuse_alpha"); + attr_map.emplace("max", "fuse_beta"); + } else { + attr_map.emplace("alpha", "fuse_alpha"); + attr_map.emplace("beta", "fuse_beta"); + } + return attr_map; +} + +static std::vector GetSupportedActivations() { + return std::vector{"abs", + "clip", + "gelu", + "hard_sigmoid", + "hard_swish", + "leaky_relu", + "mish", + "relu", + "relu6", + "sigmoid", + "sqrt", + "swish", + "tanh"}; } class ReorderMKLDNNHandler { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py index 0c25a790138cd..5e5dd4c719d98 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_softplus_activation_fuse_pass.py @@ -23,8 +23,8 @@ class SoftplusActivationReluOneDNNFusePassTest(InferencePassTest): - fuse_activation_alpha = None - fuse_activation_beta = None + fuse_alpha = None + fuse_beta = None pass_name = 'softplus_activation_mkldnn_fuse_pass' def setUp(self): @@ -34,13 +34,13 @@ def setUp(self): shape=[-1, 3, 100, 100], dtype="float32") softplus_out = fluid.layers.softplus(data) - if self.fuse_activation_beta is not None: - activation_out = self.fuse_activation( - softplus_out, self.fuse_activation_alpha, - self.fuse_activation_beta) - elif self.fuse_activation_alpha is not None: - activation_out = self.fuse_activation( - softplus_out, self.fuse_activation_alpha) + if self.fuse_beta is not None: + activation_out = self.fuse_activation(softplus_out, + self.fuse_alpha, + self.fuse_beta) + elif self.fuse_alpha is not None: + activation_out = self.fuse_activation(softplus_out, + 
self.fuse_alpha) else: activation_out = self.fuse_activation(softplus_out) @@ -73,7 +73,7 @@ class SoftplusActivationLeakyReluOneDNNFusePassTest( def set_params(self): self.fuse_activation = fluid.layers.leaky_relu - self.fuse_activation_alpha = 0.3 + self.fuse_alpha = 0.3 class SoftplusActivationSwishOneDNNFusePassTest( @@ -81,7 +81,7 @@ class SoftplusActivationSwishOneDNNFusePassTest( def set_params(self): self.fuse_activation = fluid.layers.swish - self.fuse_activation_alpha = 3 + self.fuse_alpha = 3 class SoftplusActivationHardSwishOneDNNFusePassTest( @@ -110,8 +110,8 @@ class SoftplusActivationClipOneDNNFusePassTest( def set_params(self): self.fuse_activation = fluid.layers.clip - self.fuse_activation_alpha = 1.1 - self.fuse_activation_beta = 5.2 + self.fuse_alpha = 1.1 + self.fuse_beta = 5.2 class SoftplusActivationGeluErfOneDNNFusePassTest( @@ -126,7 +126,7 @@ class SoftplusActivationGeluTanhOneDNNFusePassTest( def set_params(self): self.fuse_activation = fluid.layers.gelu - self.fuse_activation_alpha = True # simulated "Approximate" attr + self.fuse_alpha = True # simulated "Approximate" attr class SoftplusActivationRelu6OneDNNFusePassTest( From 449ea33de2961bce799479a681d2e4a42690dd88 Mon Sep 17 00:00:00 2001 From: wangguanqun Date: Mon, 11 Jul 2022 17:03:23 +0800 Subject: [PATCH 120/250] [GPUPS]SSDSparseTable add PullSparsePtr (#44137) * ssd pullsparseptr * update codestyle --- .../distributed/ps/table/ssd_sparse_table.cc | 215 +++++++++++++++++- .../distributed/ps/table/ssd_sparse_table.h | 24 +- 2 files changed, 215 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc index 071a1703e2a6d..3e0f631ed41bc 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.cc @@ -39,6 +39,33 @@ int32_t SSDSparseTable::Initialize() { int32_t SSDSparseTable::InitializeShard() { return 0; } +int32_t SSDSparseTable::Pull(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + char** pull_values = context.pull_context.ptr_values; + const uint64_t* keys = context.pull_context.keys; + return PullSparsePtr(pull_values, keys, context.num); + } else { + float* pull_values = context.pull_context.values; + const PullSparseValue& pull_value = context.pull_context.pull_value; + return PullSparse(pull_values, pull_value.feasigns_, pull_value.numel_); + } +} + +int32_t SSDSparseTable::Push(TableContext& context) { + CHECK(context.value_type == Sparse); + if (context.use_ptr) { + return PushSparse(context.push_context.keys, + context.push_context.ptr_values, + context.num); + } else { + const uint64_t* keys = context.push_context.keys; + const float* values = context.push_context.values; + size_t num = context.num; + return PushSparse(keys, values, num); + } +} + int32_t SSDSparseTable::PullSparse(float* pull_values, const uint64_t* keys, size_t num) { @@ -73,7 +100,7 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, &missed_keys]() -> int { auto& keys = task_keys[shard_id]; auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_size]; + float data_buffer[value_size]; // NOLINT float* data_buffer_ptr = data_buffer; for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; @@ -83,7 +110,7 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, // pull rocksdb std::string tmp_string(""); if (_db->get(shard_id, - (char*)&key, + reinterpret_cast(&key), sizeof(uint64_t), 
tmp_string) > 0) { ++missed_keys; @@ -110,7 +137,9 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, memcpy(const_cast(feature_value.data()), data_buffer_ptr, data_size * sizeof(float)); - _db->del_data(shard_id, (char*)&key, sizeof(uint64_t)); + _db->del_data(shard_id, + reinterpret_cast(&key), + sizeof(uint64_t)); } } else { data_size = itr.value().size(); @@ -142,6 +171,95 @@ int32_t SSDSparseTable::PullSparse(float* pull_values, return 0; } +int32_t SSDSparseTable::PullSparsePtr(char** pull_values, + const uint64_t* keys, + size_t num) { + CostTimer timer("pserver_ssd_sparse_select_all"); + size_t value_size = _value_accesor->GetAccessorInfo().size / sizeof(float); + size_t mf_value_size = + _value_accesor->GetAccessorInfo().mf_size / sizeof(float); + + { // 从table取值 or create + std::vector> tasks(_real_local_shard_num); + std::vector>> task_keys( + _real_local_shard_num); + for (size_t i = 0; i < num; ++i) { + int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; + task_keys[shard_id].push_back({keys[i], i}); + } + + std::atomic missed_keys{0}; + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + tasks[shard_id] = + _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [this, + shard_id, + &task_keys, + value_size, + mf_value_size, + pull_values, + &missed_keys]() -> int { + auto& keys = task_keys[shard_id]; + auto& local_shard = _local_shards[shard_id]; + float data_buffer[value_size]; // NOLINT + float* data_buffer_ptr = data_buffer; + for (size_t i = 0; i < keys.size(); ++i) { + uint64_t key = keys[i].first; + auto itr = local_shard.find(key); + size_t data_size = value_size - mf_value_size; + FixedFeatureValue* ret = NULL; + if (itr == local_shard.end()) { + // pull rocksdb + std::string tmp_string(""); + if (_db->get(shard_id, + reinterpret_cast(&key), + sizeof(uint64_t), + tmp_string) > 0) { + ++missed_keys; + auto& feature_value = local_shard[key]; + feature_value.resize(data_size); + float* data_ptr = + const_cast(feature_value.data()); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy( + data_ptr, data_buffer_ptr, data_size * sizeof(float)); + ret = &feature_value; + } else { + data_size = tmp_string.size() / sizeof(float); + memcpy(data_buffer_ptr, + paddle::string::str_to_float(tmp_string), + data_size * sizeof(float)); + // from rocksdb to mem + auto& feature_value = local_shard[key]; + feature_value.resize(data_size); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + data_size * sizeof(float)); + _db->del_data(shard_id, + reinterpret_cast(&key), + sizeof(uint64_t)); + ret = &feature_value; + } + } else { + ret = itr.value_ptr(); + } + int pull_data_idx = keys[i].second; + pull_values[pull_data_idx] = reinterpret_cast(ret); + } + return 0; + }); + } + for (int i = 0; i < _real_local_shard_num; ++i) { + tasks[i].wait(); + } + if (FLAGS_pserver_print_missed_key_num_every_push) { + LOG(WARNING) << "total pull keys:" << num + << " missed_keys:" << missed_keys.load(); + } + } + return 0; +} + int32_t SSDSparseTable::PushSparse(const uint64_t* keys, const float* values, size_t num) { @@ -172,7 +290,7 @@ int32_t SSDSparseTable::PushSparse(const uint64_t* keys, &task_keys]() -> int { auto& keys = task_keys[shard_id]; auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_col]; + float data_buffer[value_col]; // NOLINT float* data_buffer_ptr = data_buffer; for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; @@ -201,7 +319,8 @@ int32_t 
SSDSparseTable::PushSparse(const uint64_t* keys, if (value_size == value_col) { // 已拓展到最大size, 则就地update _value_accesor->Update(&value_data, &update_data, 1); - } else { // 拷入buffer区进行update,然后再回填,不需要的mf则回填时抛弃了 + } else { + // 拷入buffer区进行update,然后再回填,不需要的mf则回填时抛弃了 memcpy(data_buffer_ptr, value_data, value_size * sizeof(float)); @@ -247,6 +366,90 @@ int32_t SSDSparseTable::PushSparse(const uint64_t* keys, return 0; } +int32_t SSDSparseTable::PushSparse(const uint64_t* keys, + const float** values, + size_t num) { + CostTimer timer("pserver_downpour_sparse_update_all"); + // 构造value push_value的数据指针 + size_t value_col = _value_accesor->GetAccessorInfo().size / sizeof(float); + size_t mf_value_col = + _value_accesor->GetAccessorInfo().mf_size / sizeof(float); + size_t update_value_col = + _value_accesor->GetAccessorInfo().update_size / sizeof(float); + { + std::vector> tasks(_real_local_shard_num); + std::vector>> task_keys( + _real_local_shard_num); + for (size_t i = 0; i < num; ++i) { + int shard_id = (keys[i] % _sparse_table_shard_num) % _avg_local_shard_num; + task_keys[shard_id].push_back({keys[i], i}); + } + for (int shard_id = 0; shard_id < _real_local_shard_num; ++shard_id) { + tasks[shard_id] = + _shards_task_pool[shard_id % _shards_task_pool.size()]->enqueue( + [this, + shard_id, + value_col, + mf_value_col, + update_value_col, + values, + &task_keys]() -> int { + auto& keys = task_keys[shard_id]; + auto& local_shard = _local_shards[shard_id]; + float data_buffer[value_col]; // NOLINT + float* data_buffer_ptr = data_buffer; + for (size_t i = 0; i < keys.size(); ++i) { + uint64_t key = keys[i].first; + uint64_t push_data_idx = keys[i].second; + const float* update_data = values[push_data_idx]; + auto itr = local_shard.find(key); + if (itr == local_shard.end()) { + if (FLAGS_pserver_enable_create_feasign_randomly && + !_value_accesor->CreateValue(1, update_data)) { + continue; + } + auto value_size = value_col - mf_value_col; + auto& feature_value = local_shard[key]; + feature_value.resize(value_size); + _value_accesor->Create(&data_buffer_ptr, 1); + memcpy(const_cast(feature_value.data()), + data_buffer_ptr, + value_size * sizeof(float)); + itr = local_shard.find(key); + } + auto& feature_value = itr.value(); + float* value_data = const_cast(feature_value.data()); + size_t value_size = feature_value.size(); + + if (value_size == + value_col) { // 已拓展到最大size, 则就地update + _value_accesor->Update(&value_data, &update_data, 1); + } else { + // 拷入buffer区进行update,然后再回填,不需要的mf则回填时抛弃了 + memcpy(data_buffer_ptr, + value_data, + value_size * sizeof(float)); + _value_accesor->Update(&data_buffer_ptr, &update_data, 1); + if (_value_accesor->NeedExtendMF(data_buffer)) { + feature_value.resize(value_col); + value_data = const_cast(feature_value.data()); + _value_accesor->Create(&value_data, 1); + } + memcpy(value_data, + data_buffer_ptr, + value_size * sizeof(float)); + } + } + return 0; + }); + } + for (int i = 0; i < _real_local_shard_num; ++i) { + tasks[i].wait(); + } + } + return 0; +} + int32_t SSDSparseTable::Shrink(const std::string& param) { int thread_num = _real_local_shard_num < 20 ? _real_local_shard_num : 20; omp_set_num_threads(thread_num); @@ -282,7 +485,7 @@ int32_t SSDSparseTable::Shrink(const std::string& param) { delete it; LOG(INFO) << "SSDSparseTable shrink success. 
shard:" << i << " delete MEM[" << mem_count << "] SSD[" << ssd_count << "]"; - //_db->flush(i); + // _db->flush(i); } return 0; } diff --git a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h index 5b38e4b3d73f7..55a05bbab5ec2 100644 --- a/paddle/fluid/distributed/ps/table/ssd_sparse_table.h +++ b/paddle/fluid/distributed/ps/table/ssd_sparse_table.h @@ -33,26 +33,14 @@ class SSDSparseTable : public MemorySparseTable { // exchange data int32_t UpdateTable(); - int32_t Pull(TableContext& context) override { - CHECK(context.value_type == Sparse); - float* pull_values = context.pull_context.values; - const PullSparseValue& pull_value = context.pull_context.pull_value; - return PullSparse(pull_values, pull_value.feasigns_, pull_value.numel_); - } + int32_t Pull(TableContext& context) override; - int32_t Push(TableContext& context) override { - const uint64_t* keys = context.push_context.keys; - const float* values = context.push_context.values; - size_t num = context.num; - return PushSparse(keys, values, num); - } + int32_t Push(TableContext& context) override; - virtual int32_t PullSparse(float* pull_values, - const uint64_t* keys, - size_t num); - virtual int32_t PushSparse(const uint64_t* keys, - const float* values, - size_t num); + int32_t PullSparse(float* pull_values, const uint64_t* keys, size_t num); + int32_t PullSparsePtr(char** pull_values, const uint64_t* keys, size_t num); + int32_t PushSparse(const uint64_t* keys, const float* values, size_t num); + int32_t PushSparse(const uint64_t* keys, const float** values, size_t num); int32_t Flush() override { return 0; } virtual int32_t Shrink(const std::string& param) override; From 6a7dfdd0c574a90a31241cdebf11f2f380ff5148 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Mon, 11 Jul 2022 17:04:05 +0800 Subject: [PATCH 121/250] einsum support complex (#44212) einsum support complex and add unittest. 
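A minimal usage sketch of what this change enables (illustrative only, not part of the committed diff; it mirrors the TestComplex unit test added below):

    import paddle

    a = paddle.cast(paddle.rand([4, 4]), 'complex64')
    b = paddle.cast(paddle.rand([4, 4]), 'complex64')
    # With the complex<float>/complex<double> kernels registered below,
    # einsum accepts complex inputs instead of failing with an
    # unsupported-dtype error.
    c = paddle.einsum('xy,yz->xz', a, b)  # c.dtype is paddle.complex64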
---
 paddle/phi/kernels/cpu/einsum_kernel.cc             | 10 ++++++++--
 paddle/phi/kernels/gpu/einsum_kernel.cu             |  4 +++-
 .../paddle/fluid/tests/unittests/test_einsum_v2.py  | 14 ++++++++++++++
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/paddle/phi/kernels/cpu/einsum_kernel.cc b/paddle/phi/kernels/cpu/einsum_kernel.cc
index 401d2fd158a5d..901c1fed628d3 100644
--- a/paddle/phi/kernels/cpu/einsum_kernel.cc
+++ b/paddle/phi/kernels/cpu/einsum_kernel.cc
@@ -18,5 +18,11 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/einsum_impl.h"
 
-PD_REGISTER_KERNEL(
-    einsum, CPU, ALL_LAYOUT, phi::EinsumKernelRaw, float, double) {}
+PD_REGISTER_KERNEL(einsum,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::EinsumKernelRaw,
+                   float,
+                   double,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/einsum_kernel.cu b/paddle/phi/kernels/gpu/einsum_kernel.cu
index d1f4c6590387a..b3706710c40e3 100644
--- a/paddle/phi/kernels/gpu/einsum_kernel.cu
+++ b/paddle/phi/kernels/gpu/einsum_kernel.cu
@@ -25,4 +25,6 @@ PD_REGISTER_KERNEL(einsum,
                    float,
                    double,
                    phi::dtype::float16,
-                   phi::dtype::bfloat16) {}
+                   phi::dtype::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/python/paddle/fluid/tests/unittests/test_einsum_v2.py b/python/paddle/fluid/tests/unittests/test_einsum_v2.py
index 97f3eef51a5bf..224f44d74864b 100644
--- a/python/paddle/fluid/tests/unittests/test_einsum_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_einsum_v2.py
@@ -541,5 +541,19 @@ def test_shape(self):
         self.assertEqual(C.item(), 8.0)
 
 
+class TestComplex(unittest.TestCase):
+    """
+    EinsumOp support Complex type
+    """
+
+    def test_shape(self):
+        a = paddle.rand([4, 4])
+        b = paddle.rand([4, 4])
+        c = paddle.einsum('xy,yz->xz', a, b)
+        a = paddle.cast(a, 'complex64')
+        b = paddle.cast(b, 'complex64')
+        c = paddle.einsum('xy,yz->xz', a, b)
+
+
 if __name__ == "__main__":
     unittest.main()
From f98b8dbc4d6904650ae1c1929386880cfbb4ba73 Mon Sep 17 00:00:00 2001
From: Sing_chan <51314274+betterpig@users.noreply.github.com>
Date: Mon, 11 Jul 2022 17:25:24 +0800
Subject: [PATCH 122/250] make linux link both paddle_inference_shared and
 paddle_inference_c to avoid size increase (#44180)

---
 paddle/fluid/inference/capi/CMakeLists.txt    |  4 +
 .../fluid/inference/capi_exp/CMakeLists.txt   |  4 +
 .../fluid/inference/tests/api/CMakeLists.txt  | 96 ++++++++++---------
 3 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/paddle/fluid/inference/capi/CMakeLists.txt b/paddle/fluid/inference/capi/CMakeLists.txt
index 25d8a39dc6374..c6ee6bab3c776 100644
--- a/paddle/fluid/inference/capi/CMakeLists.txt
+++ b/paddle/fluid/inference/capi/CMakeLists.txt
@@ -20,6 +20,10 @@ cc_library(
   SRCS ${C_API_SRCS}
   DEPS paddle_inference)
 
+if(NOT ON_INFER AND NOT WIN32)
+  return()
+endif()
+
 # Create inference capi shared library
 cc_library(
   paddle_inference_c_shared SHARED
diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt
index 56de57cbb9c85..089a766b91cfe 100644
--- a/paddle/fluid/inference/capi_exp/CMakeLists.txt
+++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt
@@ -20,6 +20,10 @@ cc_library(
   SRCS ${C_API_SRCS}
   DEPS paddle_inference)
 
+if(NOT ON_INFER AND NOT WIN32)
+  return()
+endif()
+
 # Create inference capi shared library
 cc_library(
   paddle_inference_c_shared SHARED
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 610883ad1ad27..4463a949948d8 100644
---
a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -3,7 +3,15 @@ if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") return() endif() -set(INFERENCE_EXTRA_DEPS paddle_inference_shared) +# In Windows, c_api test link must link both 2 shared to avoid symbols redefinition, +# in Linux, c_api test cant do like this or graph_to_program register more than once. +# Both Windows and Linux can only use paddle_inference_c, but this will increase size +# of build folder by 30G. +if(WIN32) + set(INFERENCE_C_EXTRA_DEPS paddle_inference_shared paddle_inference_c_shared) +else() + set(INFERENCE_C_EXTRA_DEPS paddle_inference_shared paddle_inference_c) +endif() function(download_data install_dir data_file check_sum) string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) @@ -103,7 +111,7 @@ function(inference_analysis_api_test target install_dir filename) SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt @@ -116,7 +124,7 @@ function(inference_analysis_api_int8_test target install_dir filename) SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt @@ -133,7 +141,7 @@ function(inference_multiple_models_analysis_api_test target install_dir SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${install_dir}/mobilenet_v2_models/1 --infer_model2=${install_dir}/mobilenet_v2_models/xx @@ -142,7 +150,7 @@ endfunction() function(inference_analysis_api_test_build TARGET_NAME filename) inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS}) + paddle_inference_shared) endfunction() function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir @@ -201,7 +209,7 @@ endfunction() function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename) inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS}) + paddle_inference_shared) endfunction() function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary @@ -369,7 +377,7 @@ inference_analysis_test( SRCS analyzer_dam_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt) @@ -430,7 +438,7 @@ inference_analysis_test( SRCS analyzer_ernie_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt @@ -474,7 +482,7 @@ inference_analysis_test( SRCS analyzer_transformer_compare_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt @@ -485,7 +493,7 @@ inference_analysis_test( SRCS analyzer_transformer_fuse_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt @@ -496,7 +504,7 @@ inference_analysis_test( SRCS analyzer_transformer_profile_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt @@ -514,7 +522,7 @@ inference_analysis_test( SRCS analyzer_vit_ocr_tester.cc EXTRA_DEPS - 
${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${VIT_OCR_INSTALL_DIR}/vit_ocr/model --infer_data=${VIT_OCR_INSTALL_DIR}/vit_ocr/datavit.txt) @@ -537,7 +545,7 @@ inference_analysis_test( SRCS analyzer_detect_functional_mkldnn_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt @@ -895,7 +903,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_mobilenet_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -903,7 +911,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_resnet50_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -911,7 +919,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_resnext_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -919,7 +927,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_fc_prelu_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -927,7 +935,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_cascade_rcnn_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test( @@ -935,7 +943,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_split_converter_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/) inference_analysis_test( @@ -943,7 +951,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) if(WIN32) @@ -956,7 +964,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) if(WIN32) @@ -977,7 +985,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_quant_int8_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) @@ -993,7 +1001,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_quant_int8_yolov3_r50_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR}) @@ -1016,7 +1024,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}) @@ -1032,7 +1040,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_ernie_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) @@ -1049,7 +1057,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_transformer_prune_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) @@ -1064,7 +1072,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_ernie_serialize_deserialize_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS 
--infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) @@ -1079,7 +1087,7 @@ if(WITH_GPU AND TENSORRT_FOUND) SRCS trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized) @@ -1093,7 +1101,7 @@ inference_analysis_test( SRCS lite_mul_model_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${LITE_MODEL_INSTALL_DIR}) inference_analysis_test( @@ -1101,7 +1109,7 @@ inference_analysis_test( SRCS lite_resnet50_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) @@ -1110,7 +1118,7 @@ inference_analysis_test( SRCS analyzer_capi_exp_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}/model) if(WIN32) @@ -1124,7 +1132,7 @@ inference_analysis_test( SRCS analyzer_capi_exp_pd_config_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) if(WIN32) @@ -1139,7 +1147,7 @@ inference_analysis_test( SRCS analyzer_capi_exp_pd_tensor_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) if(WIN32) @@ -1155,7 +1163,7 @@ if(NOT APPLE AND NOT WIN32) SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) if(WIN32) @@ -1171,7 +1179,7 @@ inference_analysis_test( SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -1183,7 +1191,7 @@ if(WITH_DISTRIBUTE SRCS analyzer_dist_model_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model) endif() @@ -1193,7 +1201,7 @@ inference_analysis_test( SRCS analyzer_paddle_tensor_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt @@ -1205,7 +1213,7 @@ if(WITH_MKLDNN) SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) if(WIN32) @@ -1220,7 +1228,7 @@ inference_analysis_test( SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + ${INFERENCE_C_EXTRA_DEPS} ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) if(WIN32) @@ -1235,7 +1243,7 @@ if(WITH_GPU) SRCS paddle_infer_api_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) @@ -1244,7 +1252,7 @@ if(WITH_GPU) SRCS paddle_infer_api_copy_tensor_tester.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR}) set_tests_properties(paddle_infer_api_copy_tensor_tester PROPERTIES TIMEOUT @@ -1316,7 +1324,7 @@ if(WITH_IPU) SRCS ipu_word2vec_sample.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${WORD2VEC_INSTALL_DIR}) @@ -1335,7 +1343,7 @@ if(WITH_IPU) SRCS ipu_resnet50_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true @@ -1345,7 +1353,7 @@ if(WITH_IPU) SRCS ipu_resnet50_fp16_test.cc EXTRA_DEPS - ${INFERENCE_EXTRA_DEPS} + paddle_inference_shared ARGS --infer_model=${RESNET50_MODEL_DIR} 
--warmup=true
From ab57cbf68f285872035d7e1e63daedd033c21204 Mon Sep 17 00:00:00 2001
From: xiongkun
Date: Mon, 11 Jul 2022 18:54:54 +0800
Subject: [PATCH 123/250] [ DocFix ] fix the doc of `def cond` (#44157)

* fix the doc of
* fix the doc

---
 python/paddle/fluid/layers/control_flow.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index 87010bc616a64..bc1a2c15dd3ac 100755
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -2454,10 +2454,13 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None):
             true. The default value is ``None`` .
         false_fn(callable, optional): A callable to be performed if ``pred`` is
             false. The default value is ``None`` .
-        return_names: A list of strings to represents the name of returned vars.  useful to debug.
         name(str, optional): The default value is ``None`` . Normally users don't
             have to set this parameter. For more information, please refer to
             :ref:`api_guide_Name` .
+        return_names(sequence of string, optional): The default value is ``None`` .
+            Normally users don't have to set this parameter. A sequence of strings
+            to represent the names of the returned variables. The structure of the
+            sequence must match the return values of true_fn and false_fn.
 
     Returns:
         Tensor|list(Tensor)|tuple(Tensor): returns ``true_fn()`` if the
From d4372a1e4b0d85d5e252aaffb4bafec21158e1da Mon Sep 17 00:00:00 2001
From: Zuza Gawrysiak
Date: Mon, 11 Jul 2022 13:23:20 +0200
Subject: [PATCH 124/250] Quantize shape operator (#44124)

* Quantize shape operator

* Add shape op to propagate scales pass

---
 paddle/fluid/framework/ir/graph_pattern_detector.h     |  2 +-
 .../ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc  |  3 ++-
 paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc  |  1 +
 .../framework/ir/mkldnn/cpu_quantize_pass_tester.cc    | 10 +++++++---
 paddle/fluid/inference/api/mkldnn_quantizer.cc         |  2 +-
 paddle/fluid/inference/api/mkldnn_quantizer_config.cc  |  7 +++++++
 .../slim/quantization/quant2_int8_mkldnn_pass.py       |  6 +++---
 7 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 9210cecabe7c6..09dd426be2daf 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1056,7 +1056,7 @@ struct ResidualElementwise : public PatternBase {
 };
 
 // General struct for immutable ops:
-// reshape, transpose, slice, nearest-interp
+// reshape, transpose, slice, shape, nearest-interp
 // Forward pass for no weights-op.
 // immutable_out is a result of the operator.
struct Immutable : public PatternBase { diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index ed4d586b8bb5c..f7ee6a96dce04 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -372,7 +372,7 @@ std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( const auto op_name = op_node->Name(); if (scale_immutable_ops.count(op_name)) { std::string input_name; - if (op_name == "slice") { + if (op_name == "slice" || op_name == "shape") { input_name = op_node->Op()->Input("Input")[0]; } else { input_name = op_node->Op()->Input("X")[0]; @@ -445,6 +445,7 @@ void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { "reshape2", "pool2d", "slice", + "shape", "nearest_interp", "nearest_interp_v2"}; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 26a4478fff683..7cfc3f3336d5f 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -1136,6 +1136,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeImmutable(graph, "reshape2", "X"); QuantizeImmutable(graph, "transpose2", "X"); QuantizeImmutable(graph, "slice", "Input"); + QuantizeImmutable(graph, "shape", "Input"); QuantizeImmutable(graph, "nearest_interp", "X"); QuantizeImmutable(graph, "nearest_interp_v2", "X"); QuantizeElementwise(graph, "elementwise_add"); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 322aa22c6ad14..ec7432e83f874 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -66,7 +66,7 @@ void SetOp(ProgramDesc* prog, type == "nearest_interp" || type == "nearest_interp_v2") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); - } else if (type == "slice") { + } else if (type == "slice" || type == "shape") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); } else if (type == "dropout") { @@ -550,8 +550,12 @@ void TestImmutableOpWithManyOutputs(const std::string tested_op) { SCALE * S8_MAX); } -const std::vector immutables = { - "reshape2", "transpose2", "slice", "nearest_interp", "nearest_interp_v2"}; +const std::vector immutables = {"reshape2", + "transpose2", + "slice", + "shape", + "nearest_interp", + "nearest_interp_v2"}; class TestImmutables : public testing::TestWithParam {}; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index bca2cde0fc2c6..cef7402e6c061 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -142,7 +142,7 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( scales_[var_name] = scales_[input_var_name]; } compute_scale = false; - } else if (op->Type() == "slice") { + } else if (op->Type() == "slice" || op->Type() == "shape") { auto input_var_name = op->Input("Input")[0]; PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(), diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index d4fa78518e149..bfe6c5a94776a 100644 --- 
a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -45,6 +45,9 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["slice"]["Input"] = ScaleAlgo::KL; rules_["slice"]["Out"] = ScaleAlgo::NONE; + rules_["shape"]["Input"] = ScaleAlgo::KL; + rules_["shape"]["Out"] = ScaleAlgo::NONE; + rules_["fc"]["Input"] = ScaleAlgo::KL; rules_["fc"]["W"] = ScaleAlgo::MAX_CH_T; rules_["fc"]["Bias"] = ScaleAlgo::NONE; @@ -62,6 +65,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["elementwise_mul"]["Y"] = ScaleAlgo::KL; rules_["elementwise_mul"]["Out"] = ScaleAlgo::KL; + rules_["elementwise_sub"]["X"] = ScaleAlgo::KL; + rules_["elementwise_sub"]["Y"] = ScaleAlgo::KL; + rules_["elementwise_sub"]["Out"] = ScaleAlgo::KL; + // Reshape2 does not perform calculation on the data and shapes are not // changed. Scale is calculated on input data and assign to Quantize and // Dequantize scale. diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 622d54343f6a0..0d17673a2d522 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -63,8 +63,8 @@ def __init__(self, self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip is not None else set( [-1]) self._scale_immutable_ops = [ - 'transpose2', 'reshape2', 'pool2d', 'slice', 'nearest_interp', - 'nearest_interp_v2' + 'transpose2', 'reshape2', 'pool2d', 'slice', 'shape', + 'nearest_interp', 'nearest_interp_v2' ] self._scale_ops = ['scale'] self._conv_ops = ['conv2d', 'depthwise_conv2d'] @@ -247,7 +247,7 @@ def _update_scales(graph): waiting_for_scale = set() for op in graph.all_op_nodes(): if op.name() in self._scale_immutable_ops: - if op.name() == 'slice': + if op.name() == 'slice' or op.name() == 'shape': input_name = op.input("Input")[0] else: input_name = op.input("X")[0] From 02e4f1f897b3b62a31923cb554c9674769da8e90 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 11 Jul 2022 19:36:42 +0800 Subject: [PATCH 125/250] Add Concat transformer for layout autotune (#42003) * Add Concat transformer for layout autotune --- paddle/fluid/imperative/layout_autotune.cc | 2 + paddle/fluid/imperative/layout_transformer.h | 46 +++++++++++++++++++ .../tests/unittests/test_layout_autotune.py | 29 ++++++++++++ 3 files changed, 77 insertions(+) diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc index 669a4af99f31f..10a4a2e69d540 100644 --- a/paddle/fluid/imperative/layout_autotune.cc +++ b/paddle/fluid/imperative/layout_autotune.cc @@ -131,6 +131,8 @@ paddle::imperative::NameVarMap DealLightlyLayoutSensitive( transposer = std::make_shared>(op_type); } else if (op_type == "arg_max") { transposer = std::make_shared>(op_type); + } else if (op_type == "concat") { + transposer = std::make_shared>(op_type); } else if (op_type.find("elementwise_") != std::string::npos) { transposer = std::make_shared>(op_type); } else { diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index ab7619dedb2e9..fa7261b6d52b6 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -401,5 +401,51 @@ class ArgmaxOpTransformer } }; +template +class ConcatOpTransformer + : public 
LightlyLayoutSensitiveOpTransformer { + public: + explicit ConcatOpTransformer(const std::string& type) + : LightlyLayoutSensitiveOpTransformer(type) {} + + paddle::imperative::NameVarMap Apply( + const paddle::imperative::NameVarMap& ins, + const paddle::imperative::NameVarMap& outs, + paddle::framework::AttributeMap* attrs, + const std::shared_ptr& tracer) { + VLOG(3) << "Optimze lightly layout sensitive op " << this->Type(); + auto& in_var = ins.at("X")[0]; + auto var_layout = paddle::imperative::GetDataLayout(in_var); + bool need_tranppose = false; + for (auto& pair : ins) { + for (auto& var : pair.second) { + if (var != nullptr && + (paddle::imperative::GetDataLayout(var) != var_layout)) { + need_tranppose = true; + break; + } + } + } + + if (need_tranppose) { + return LightlyLayoutSensitiveOpTransformer::Apply( + ins, outs, attrs, tracer); + } + + if (var_layout != DataLayout::UNDEFINED) { + std::vector perm_nhwc = {0, 3, 1, 2}; + std::vector perm_nchw = {0, 2, 3, 1}; + auto perm = var_layout == DataLayout::NHWC ? perm_nhwc : perm_nchw; + auto axis = BOOST_GET_CONST(int, (*attrs)["axis"]); + (*attrs)["axis"] = static_cast(perm[axis]); + } + auto axis = BOOST_GET_CONST(int, (*attrs)["axis"]); + VLOG(3) << "Optimze lightly layout sensitive op asdfasdfasdf axis" << axis; + + this->SetVarsLayout(outs, var_layout); + return ins; + } +}; + } // namespace imperative } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index fc9b51c5fc040..5cb53437fe9cd 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -161,6 +161,35 @@ def test_argmax_op_transposer(self): self.assertEqual(conv_out.shape, [1, 14, 12, 8]) self.assertEqual(out.shape, [1]) + def test_concat_op_transposer(self): + if not self.use_autoune(): + return + in1 = paddle.rand([1, 8, 14, 12]) + conv = paddle.nn.Conv2D(3, 8, (3, 3)) + data = paddle.rand([1, 3, 16, 14]) + with paddle.amp.auto_cast(level="O2"): + conv_out = conv(data) + # conv_out.shape = [1, 14, 12, 8] with NHWC + out = paddle.concat(x=[conv_out, in1], axis=0) + + self.assertEqual(conv_out.shape, [1, 14, 12, 8]) + self.assertEqual(out.shape, [2, 8, 14, 12]) + + def test_concat_op_no_transposer(self): + if not self.use_autoune(): + return + conv = paddle.nn.Conv2D(3, 8, (3, 3)) + data1 = paddle.rand([1, 3, 16, 14]) + data2 = paddle.rand([1, 3, 16, 14]) + with paddle.amp.auto_cast(level="O2"): + conv_out1 = conv(data1) + conv_out2 = conv(data2) + # conv_out.shape = [1, 14, 12, 8] with NHWC + out = paddle.concat(x=[conv_out1, conv_out2], axis=0) + + self.assertEqual(conv_out1.shape, [1, 14, 12, 8]) + self.assertEqual(out.shape, [2, 14, 12, 8]) + class TestAutoTuneAPI(unittest.TestCase): From 37216a8fb9c95c0a477fc2eef67a8ca27f48b6a6 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Mon, 11 Jul 2022 20:20:51 +0800 Subject: [PATCH 126/250] [Dygraph] Support new apis in ProcessGroupNCCL (#43918) * fix conflict * new pg apis * add docs of new apis * update * fix coverage * update * fix bug * fix reduce scatter * fix api * update Co-authored-by: ForFishes <2282912238@qq.com> --- .../distributed/collective/ProcessGroup.h | 18 + .../collective/ProcessGroupNCCL.cc | 144 ++++- .../distributed/collective/ProcessGroupNCCL.h | 22 +- paddle/fluid/distributed/collective/Types.h | 4 + paddle/fluid/pybind/distributed_py.cc | 80 ++- 
python/paddle/distributed/__init__.py | 44 +- python/paddle/distributed/collective.py | 606 +++++++++++++++++- python/paddle/distributed/parallel.py | 2 + .../fluid/tests/unittests/CMakeLists.txt | 11 +- .../unittests/collective_alltoall_single.py | 86 +++ .../unittests/collective_batch_isend_irecv.py | 57 ++ .../unittests/collective_reduce_scatter.py | 98 +++ .../test_collective_alltoall_single.py | 32 + .../test_collective_batch_isend_irecv.py | 32 + .../test_collective_reduce_scatter.py | 32 + 15 files changed, 1193 insertions(+), 75 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/collective_alltoall_single.py create mode 100644 python/paddle/fluid/tests/unittests/collective_batch_isend_irecv.py create mode 100644 python/paddle/fluid/tests/unittests/collective_reduce_scatter.py create mode 100644 python/paddle/fluid/tests/unittests/test_collective_alltoall_single.py create mode 100644 python/paddle/fluid/tests/unittests/test_collective_batch_isend_irecv.py create mode 100644 python/paddle/fluid/tests/unittests/test_collective_reduce_scatter.py diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 48dd6d8285699..be3bfc0dc0029 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -46,6 +46,7 @@ enum class CommType : std::uint8_t { SEND = 9, RECV = 10, BARRIER = 11, + ALLTOALL_SINGLE = 12, UNKNOWN = 100, }; @@ -143,6 +144,15 @@ class ProcessGroup { "ProcessGroup%s does not support AllToAll", GetBackendName())); } + virtual std::shared_ptr AllToAll_Single( + std::vector&, // NOLINT + std::vector&, // NOLINT + std::vector&, + std::vector&) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllToAll_Single", GetBackendName())); + } + virtual std::shared_ptr Reduce( std::vector&, // NOLINT std::vector&, // NOLINT @@ -159,6 +169,14 @@ class ProcessGroup { "ProcessGroup%s does not support Scatter", GetBackendName())); } + virtual std::shared_ptr _ReduceScatterBase( + phi::DenseTensor&, // NOLINT + phi::DenseTensor&, // NOLINT + const ReduceScatterOptions&) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support ReduceScatter", GetBackendName())); + } + protected: const int rank_; const int size_; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index e6e69f0be3ae5..1beca8022e9f9 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -85,6 +85,34 @@ bool ProcessGroupNCCL::NCCLTask::IsCompleted() { return true; } +void ProcessGroupNCCL::CheckSplitSizes(std::vector& split_sizes, + std::vector tensor_shape) { + int64_t len_size = split_sizes.size(); + if (len_size == 0) { + PADDLE_ENFORCE_EQ(tensor_shape[0] % size_ == 0, + true, + platform::errors::InvalidArgument( + "Tensor's dim[0] must be divisible by group size " + "when split_sizes not given.")); + split_sizes.insert(split_sizes.end(), + size_, + static_cast(tensor_shape[0] / size_)); + } else { + PADDLE_ENFORCE_EQ( + len_size == size_, + true, + platform::errors::InvalidArgument( + "The length of split_sizes must be equal to group size.")); + auto sum_size = std::accumulate( + split_sizes.begin(), split_sizes.end(), static_cast(0)); + PADDLE_ENFORCE_EQ( + sum_size == tensor_shape[0], + true, + platform::errors::InvalidArgument( + "The sum of split_sizes must 
be equal to tensor's dim[0].")); + } +} + // TODO(sheniang03): Add timeout for wait, now timeout unused bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { SynchronizeStreams(); @@ -637,7 +665,69 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); }, - CommType::ALLREDUCE); + CommType::ALLTOALL); +} + +std::shared_ptr ProcessGroupNCCL::AllToAll_Single( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(), + true, + platform::errors::InvalidArgument( + "The dtypes of input and output must be equal.")); + + std::vector in_dims = phi::vectorize(input.dims()); + std::vector out_dims = phi::vectorize(output.dims()); + CheckSplitSizes(in_sizes, in_dims); + CheckSplitSizes(out_sizes, out_dims); + + size_t in_offset = 0, out_offset = 0; + size_t in_length = 0, out_length = 0; + size_t in_row_size = input.numel() / in_dims[0]; + size_t out_row_size = output.numel() / out_dims[0]; + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + in_length = in_sizes[i] * in_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), in_offset, input.dtype()), + in_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + in_offset += in_length; + + out_length = out_sizes[i] * out_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output.data(), out_offset, input.dtype()), + out_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + out_offset += out_length; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLTOALL_SINGLE); } std::shared_ptr ProcessGroupNCCL::Reduce( @@ -721,5 +811,57 @@ std::shared_ptr ProcessGroupNCCL::Scatter( CommType::SCATTER); } +std::shared_ptr ProcessGroupNCCL::_ReduceScatterBase( + phi::DenseTensor& out_tensor, + phi::DenseTensor& in_tensor, + const ReduceScatterOptions& opts) { + // auto tensor = out_tensors.back(); + PADDLE_ENFORCE_EQ( + out_tensor.dtype(), + in_tensor.dtype(), + platform::errors::InvalidArgument( + "Input tensor and output tensor should be same dtype.")); + + PADDLE_ENFORCE_EQ( + out_tensor.numel() * size_, + in_tensor.numel(), + platform::errors::InvalidArgument("input tensor must be the same size as " + "output tensor size times world_size")); + + auto inputs = std::vector{in_tensor}; + auto outputs = std::vector{out_tensor}; + + return Collective( + inputs, + outputs, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator) { + platform::CUDADeviceGuard cuda_guard; + cuda_guard.SetDevice(output.place()); + memory::RecordStream(output.Holder(), stream); + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( + input.data(), + output.data(), + output.numel(), + platform::ToNCCLDataType(input.dtype()), + 
ToNCCLRedType(opts.reduce_op), + comm, + stream)); + }, + CommType::REDUCE_SCATTER); +} + +void ProcessGroupNCCL::GroupStart() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); +} + +void ProcessGroupNCCL::GroupEnd() { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index a26f5947ce2b8..a8adffe64e70d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -129,6 +129,12 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& in, std::vector& out) override; + std::shared_ptr AllToAll_Single( + std::vector& in, + std::vector& out, + std::vector& in_sizes, + std::vector& out_sizes) override; + std::shared_ptr Reduce( std::vector& tensors, std::vector& out_tensors, @@ -139,6 +145,15 @@ class ProcessGroupNCCL : public ProcessGroup { std::vector& out_tensors, const ScatterOptions&) override; + std::shared_ptr _ReduceScatterBase( + phi::DenseTensor&, // NOLINT + phi::DenseTensor&, // NOLINT + const ReduceScatterOptions&) override; + + static void GroupStart(); + + static void GroupEnd(); + protected: virtual std::shared_ptr CreateTask( std::vector places, @@ -162,8 +177,8 @@ class ProcessGroupNCCL : public ProcessGroup { std::set used_place_ids_; private: - void BcastNCCLId(std::vector& nccl_ids, - int root, // NOLINT + void BcastNCCLId(std::vector& nccl_ids, // NOLINT + int root, // NOLINT int server_fd); void BroadcastUniqueNCCLID(std::vector& nccl_ids); // NOLINT @@ -190,6 +205,9 @@ class ProcessGroupNCCL : public ProcessGroup { void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); + + void CheckSplitSizes(std::vector& split_sizes, + std::vector tensor_shape); }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 973f7c6435427..0ce92111f6a13 100644 --- a/paddle/fluid/distributed/collective/Types.h +++ b/paddle/fluid/distributed/collective/Types.h @@ -45,5 +45,9 @@ struct ScatterOptions { int root_rank = 0; }; +struct ReduceScatterOptions { + ReduceOp reduce_op = ReduceOp::SUM; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index bdaebf13f8d2a..b8d5a0de820e7 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -225,6 +225,30 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "alltoall_single", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + std::vector in_sizes, + std::vector out_sizes) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + return self.AllToAll_Single( + in_tensors, out_tensors, in_sizes, out_sizes); + }, + py::arg("in"), + py::arg("out"), + py::arg("in_sizes"), + py::arg("out_sizes"), + py::call_guard()) + .def( "reduce", [](distributed::ProcessGroup &self, @@ -244,7 +268,6 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::arg("op") = 
distributed::ReduceOp::SUM, py::call_guard()) - .def( "scatter", [](distributed::ProcessGroup &self, @@ -266,23 +289,50 @@ void BindDistributed(py::module *m) { py::arg("in"), py::arg("out"), py::arg("src"), + py::call_guard()) + .def( + "_reduce_scatter_base", + [](distributed::ProcessGroup &self, + py::handle py_out_tensor, + py::handle py_in_tensor, + distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + distributed::ReduceScatterOptions opts; + opts.reduce_op = op; + auto dense_out = std::dynamic_pointer_cast( + out_tensor.impl()); + auto dense_in = std::dynamic_pointer_cast( + in_tensor.impl()); + return self._ReduceScatterBase(*dense_out, *dense_in, opts); + }, + py::arg("out_tensor"), + py::arg("in_tensor"), + py::arg("op") = distributed::ReduceOp::SUM, py::call_guard()); #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) - py::class_>( - *m, "ProcessGroupNCCL", ProcessGroup) - .def(py::init &, - int, - int, - const platform::CUDAPlace &, - int>(), - py::arg("store"), - py::arg("rank"), - py::arg("world_size"), - py::arg("place"), - py::arg("group_id") = 0, - py::call_guard()); + auto processGroupNCCL = + py::class_>( + *m, "ProcessGroupNCCL", ProcessGroup) + .def(py::init &, + int, + int, + const platform::CUDAPlace &, + int>(), + py::arg("store"), + py::arg("rank"), + py::arg("world_size"), + py::arg("place"), + py::arg("group_id") = 0, + py::call_guard()); + + processGroupNCCL.def_static( + "group_start", []() { distributed::ProcessGroupNCCL::GroupStart(); }); + processGroupNCCL.def_static( + "group_end", []() { distributed::ProcessGroupNCCL::GroupEnd(); }); + #endif #if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \ diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index 003a14799c53e..ab83e2929e4bc 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -41,6 +41,14 @@ from .collective import get_group # noqa: F401 from .collective import send # noqa: F401 from .collective import wait # noqa: F401 +from .collective import is_initialized # noqa: F401 +from .collective import destroy_process_group # noqa: F401 +from .collective import alltoall_single # noqa: F401 +from .collective import isend # noqa: F401 +from .collective import irecv # noqa: F401 +from .collective import batch_isend_irecv # noqa: F401 +from .collective import P2POp # noqa: F401 +from .collective import reduce_scatter # noqa: F401 from .auto_parallel import shard_op # noqa: F401 from .auto_parallel import shard_tensor # noqa: F401 @@ -59,33 +67,11 @@ from .sharding import * # noqa: F401 __all__ = [ # noqa - "spawn", - "launch", - "scatter", - "broadcast", - "ParallelEnv", - "new_group", - "init_parallel_env", - "gloo_init_parallel_env", - "gloo_barrier", - "gloo_release", - "QueueDataset", - "split", - "CountFilterEntry", - "ShowClickEntry", - "get_world_size", - "get_group", - "all_gather", - "InMemoryDataset", - "barrier", - "all_reduce", - "alltoall", - "send", - "reduce", - "recv", - "ReduceOp", - "wait", - "get_rank", - "ProbabilityEntry", - "ParallelMode", + "spawn", "launch", "scatter", "broadcast", "ParallelEnv", "new_group", + "init_parallel_env", "gloo_init_parallel_env", "gloo_barrier", + "gloo_release", "QueueDataset", "split", "CountFilterEntry", + "ShowClickEntry", "get_world_size", "get_group", "all_gather", + "InMemoryDataset", "barrier", "all_reduce", "alltoall", "send", "reduce", + "recv", 
"ReduceOp", "wait", "get_rank", "ProbabilityEntry", "ParallelMode", + "is_initialized", "isend", "irecv", "reduce_scatter" ] diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index cb634a4b6ac1a..2506c3073941a 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -36,6 +36,7 @@ import paddle.fluid.core as core from paddle import _C_ops import paddle.fluid.dygraph_utils as dygraph_utils +import contextlib __all__ = [] @@ -136,6 +137,10 @@ def _get_global_env(): # Dict[name, Group] _group_map_by_name = {} +# backend map by group : the map of all backend from their groups +# Dict[group, backend] +_group_map_backend = {} + # Name of the default group for init_parallel_env _default_group_name = "_default_pg" @@ -175,9 +180,8 @@ def _get_group_map_by_name(): def _get_default_group(): global _group_map_by_name - assert _default_group_name in _group_map_by_name, ( - "Call paddle.distributed.init_parallel_env first " - "to initialize the distributed environment.") + assert is_initialized(), ("Call paddle.distributed.init_parallel_env first " + "to initialize the distributed environment.") return _get_group_map_by_name()[_default_group_name] @@ -193,10 +197,29 @@ def _set_group_map_by_name(name, group): _group_map_by_name[name] = group +def _set_group_map_backend(group, backend): + global _group_map_backend + assert group not in _group_map_backend + _group_map_backend[group] = backend + + def _new_ring_id(): return len(_get_group_map()) + max(_get_global_env().nrings, 9) +def _get_reduce_op(reduce_op, func_name): + if reduce_op == ReduceOp.SUM: + return core.ReduceOp.SUM + elif reduce_op == ReduceOp.MAX: + return core.ReduceOp.MAX + elif reduce_op == ReduceOp.MIN: + return core.ReduceOp.MIN + elif reduce_op == ReduceOp.PROD: + return core.ReduceOp.PRODUCT + else: + raise ValueError("Unknown reduce_op type for {}.".format(func_name)) + + def get_group(id=0): """ @@ -400,6 +423,7 @@ def new_group(ranks=None, backend=None): group = Group(rank, size, id=gid, ranks=ranks, pg=pg, name=group_name) _group_map_by_name[group_name] = group _group_map[gid] = group + _group_map_backend[group] = backend # TODO(shenliang03): This is a temporary solution to solve the problem of # hang caused by tcp @@ -462,6 +486,75 @@ def new_group(ranks=None, backend=None): return gp +def is_initialized(): + """ + + Check whether the distributed environment has been initialized + + Returns (bool): `True` if distributed environment has been initialized, otherwise `False`. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + + print(paddle.distributed.is_initialized()) + # False + + paddle.distributed.init_parallel_env() + print(paddle.distributed.is_initialized()) + # True + + """ + global _group_map_by_name + return _default_group_name in _group_map_by_name + + +def destroy_process_group(group=None): + """ + Destroy a given group for communication + + Args: + group (ProcessGroup, optional): The group to be destroyed. All of process groups, including + the default group, will be destroyed and the distributed + environment will be deinitialized. + + Returns : None + + Examples: + .. 
code-block:: python + + # required: distributed + import paddle + + paddle.distributed.init_parallel_env() + group = paddle.distributed.new_group([0, 1]) + + paddle.distributed.destroy_process_group(group) + print(paddle.distributed.is_initialized()) + # True + paddle.distributed.destroy_process_group() + print(paddle.distributed.is_initialized()) + # False + + """ + global _group_map + global _group_map_by_name + + pg = _get_default_group() if group is None else group + assert _group_map.get(pg.id, None) is not None, "Invalid group." + + if group is None: + _group_map.clear() + _group_map_by_name.clear() + _group_map_backend.clear() + else: + del _group_map[pg.id] + del _group_map_by_name[pg.name] + del _group_map_backend[pg] + + def wait(tensor, group=None, use_calc_stream=True): """ @@ -663,16 +756,7 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True): return if in_dygraph_mode(): - if op == ReduceOp.SUM: - op_type = core.ReduceOp.SUM - elif op == ReduceOp.MAX: - op_type = core.ReduceOp.MAX - elif op == ReduceOp.MIN: - op_type = core.ReduceOp.MIN - elif op == ReduceOp.PROD: - op_type = core.ReduceOp.PRODUCT - else: - raise ValueError("Unknown reduce_op type for allreduce.") + op_type = _get_reduce_op(op, "all_reduce") group = _get_default_group() if group is None else group task = group.process_group.allreduce(tensor, op_type) if use_calc_stream: @@ -768,16 +852,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): return if in_dygraph_mode(): - if op == ReduceOp.SUM: - op_type = core.ReduceOp.SUM - elif op == ReduceOp.MAX: - op_type = core.ReduceOp.MAX - elif op == ReduceOp.MIN: - op_type = core.ReduceOp.MIN - elif op == ReduceOp.PROD: - op_type = core.ReduceOp.PRODUCT - else: - raise ValueError("Unknown reduce_op type for reduce.") + op_type = _get_reduce_op(op, "reduce") group = _get_default_group() if group is None else group gdst = group.get_group_rank(dst) assert gdst >= 0, ("dst rank out of group, need global rank") @@ -1781,10 +1856,10 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): Args: in_tensor_list (list): A list of input Tensors. Every element in the list must be a Tensor whose data type should be float16, float32, float64, int32 or int64. - out_tensor_list (Tensor): A list of output Tensors. The data type of its elements should be the same as the + out_tensor_list (list): A list of output Tensors. The data type of its elements should be the same as the data type of the input Tensors. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. + use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True. Returns: None. @@ -1867,6 +1942,94 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): out_tensor_list.extend(paddle.split(out, nranks, 0)) +def alltoall_single(in_tensor, + out_tensor, + in_split_sizes=None, + out_split_sizes=None, + group=None, + use_calc_stream=True): + """ + Scatter a single input tensor to all participators and gather the received tensors in out_tensor. + + .. note:: + ``alltoall_single`` is only supported in eager mode. + + Args: + in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32 or int64. + out_tensor (Tensor): Output Tensor. 
The data type should be the same as the data type of the input Tensor. + in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` + must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None. + out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor`` + must be divisible by group size and ``out_tensor`` will be gathered averagely from all participators. Default: None. + group (Group, optional): The group instance return by ``new_group`` or None for global default group. Default: None. + use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True. + + Returns: + None, if ``use_calc_stream`` is set to ``True``; ``Task`` of ``group``, if ``use_calc_stream`` is set to ``False``. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + size = dist.get_world_size() + + # case 1 + input = paddle.arange(2, dtype='int64') + rank * 2 + # input for rank 0: [0, 1] + # input for rank 1: [2, 3] + + output = paddle.empty([2], dtype='int64') + dist.alltoall_single(input, output) + # output for rank 0: [0, 2] + # output for rank 1: [1, 3] + + # case 2 + in_split_sizes = [i + 1 for i in range(size)] + # in_split_sizes for rank 0: [1, 2] and for rank 1: [1, 2] + out_split_sizes = [rank + 1 for i in range(size)] + # out_split_sizes for rank 0: [1, 1] and for rank 1: [2, 2] + + input = paddle.ones([sum(in_split_sizes), size], dtype='float32') * rank + # input for rank 0: [[0., 0.], [0., 0.], [0., 0.]] + # input for rank 1: [[1., 1.], [1., 1.], [1., 1.]] + output = paddle.empty([(rank + 1) * size, size], dtype='float32') + + group = dist.new_group([0, 1]) + task = dist.alltoall_single(input, + output, + in_split_sizes, + out_split_sizes, + use_calc_stream=False, + group=group) + task.wait() + # output for rank 0: [[0., 0.], [1., 1.]] + # output for rank 1: [[0., 0.], [0., 0.], [1., 1.], [1., 1.]] + + """ + if group is not None and not group.is_member(): + return + + assert in_dygraph_mode(), "Only suppport alltoall_single in eager mode." + # _check_single_tensor + + group = _get_default_group() if group is None else group + in_split_sizes = [] if in_split_sizes is None else in_split_sizes + out_split_sizes = [] if out_split_sizes is None else out_split_sizes + + task = group.process_group.alltoall_single(in_tensor, out_tensor, + in_split_sizes, out_split_sizes) + if use_calc_stream: + task.wait() + return + else: + return task + + def send(tensor, dst=0, group=None, use_calc_stream=True): """ Send a tensor to the receiver. 
@@ -1902,7 +2065,8 @@ def send(tensor, dst=0, group=None, use_calc_stream=True): if in_dygraph_mode(): group = _get_default_group() if group is None else group - task = group.process_group.send(tensor, dst) + group_dst_rank = group.get_group_rank(dst) + task = group.process_group.send(tensor, group_dst_rank) if use_calc_stream: task.wait() return None @@ -1964,7 +2128,8 @@ def recv(tensor, src=0, group=None, use_calc_stream=True): if in_dygraph_mode(): group = _get_default_group() if group is None else group - task = group.process_group.recv(tensor, src) + group_src_rank = group.get_group_rank(src) + task = group.process_group.recv(tensor, group_src_rank) if use_calc_stream: task.wait() return None @@ -1991,3 +2156,390 @@ def recv(tensor, src=0, group=None, use_calc_stream=True): 'dtype': tensor.dtype, 'use_calc_stream': use_calc_stream, }) + + +def _check_single_tensor(tensor, tensor_name): + if not isinstance(tensor, (core.eager.Tensor, paddle.Tensor)): + raise RuntimeError("Invalid function argument. Expected parameter {}" + "to be of type paddle.Tensor, but it's {}".format( + tensor_name, type(tensor))) + + +def _check_tensor_list(tensor_list, tensor_name): + if not isinstance(tensor_list, list) or \ + not all(isinstance(t, (core.eager.Tensor, paddle.Tensor)) for t in tensor_list): + raise RuntimeError("Invalid function argument. Expected parameter {}" + "to be of type paddle.Tensor".format(tensor_name)) + + +def isend(tensor, dst, group=None): + """ + Sends a tensor asynchronously + + Args: + tensor (Tensor): The Tensor to send. Its data type + should be float16, float32, float64, int32 or int64. + dst (int): The destination rank. + group (Group, optional): The group instance return by new_group or None for global default group. Default: None. + + Returns: + A distributed task object. + + Warning: + This API only supports the dygraph mode. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + data = paddle.to_tensor([7, 8, 9]) + task = paddle.distributed.isend(data, dst=1) + else: + data = paddle.to_tensor([1, 2, 3]) + task = paddle.distributed.irecv(data, src=0) + + task.wait() + + print(data) + # paddle.tensor([7, 8, 9]) # Rank-0 + # paddle.tensor([7, 8, 9]) # Rank-1 + + """ + _check_single_tensor(tensor, "tensor") + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + group = _get_default_group() if group is None else group + group_dst_rank = group.get_group_rank(dst) + assert group_dst_rank >= 0, ("dst rank out of group, need global rank") + return group.process_group.send(tensor, group_dst_rank) + else: + raise RuntimeError("Don't support static graph mode currently.") + + +def irecv(tensor, src=None, group=None): + """ + Receive a tensor to the sender. + + Args: + tensor (Tensor): The Tensor to receive. Its data type + should be float16, float32, float64, int32 or int64. + src (int): The source rank id. + group (Group, optional): The group instance return by new_group or None for global default group. Default: None. + + Returns: + A distributed task object. + + Warning: + This API only supports the dygraph mode. + + Examples: + .. 
code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + data = paddle.to_tensor([7, 8, 9]) + task = paddle.distributed.isend(data, dst=1) + else: + data = paddle.to_tensor([1, 2, 3]) + task = paddle.distributed.irecv(data, src=0) + + task.wait() + + print(data) + # paddle.tensor([7, 8, 9]) # Rank-0 + # paddle.tensor([7, 8, 9]) # Rank-1 + """ + _check_single_tensor(tensor, "tensor") + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + group = _get_default_group() if group is None else group + group_src_rank = group.get_group_rank(src) + assert group_src_rank >= 0, ("src rank out of group, need global rank") + return group.process_group.recv(tensor, group_src_rank) + else: + raise RuntimeError("Don't support static graph mode currently.") + + +class P2POp(object): + """ + A class that makes point-to-point operations for "batch_isend_irecv". + + This class creates the type of P2P operation, communication buffer, peer rank, + Group. Instances of this class will be passed to + ``paddle.distributed.batch_isend_irecv`` for point-to-point communication. + + Args: + op (callable): A function to send data to or receive data from a peer process. + The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``. + tensor (Tensor): Tensor to send or receive. + peer (int): The destination or source rank. + group (Group, optional): The group instance return by new_group or None for global + default group. Default: None. + + """ + + def __init__(self, op, tensor, peer, group=None): + if op not in [isend, irecv]: + raise RuntimeError("Invalid ``op`` function. Expected ``op`` " + "to be of type ``paddle.distributed.isend`` or " + "``paddle.distributed.irecv``.") + _check_single_tensor(tensor, "tensor") + + self.op = op + self.tensor = tensor + self.peer = peer + self.group = _get_default_group() if group is None else group + + +@contextlib.contextmanager +def _with_batch_p2p_guard(backend): + if backend == "nccl": + core.ProcessGroupNCCL.group_start() + try: + yield + finally: + if backend == "nccl": + core.ProcessGroupNCCL.group_end() + + +def _check_p2p_op_list(p2p_op_list): + """ + Helper to check that the ``p2p_op_list`` is a list of P2POp instances and + all ops use the same backend. + """ + if not isinstance(p2p_op_list, list) or not all( + isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list): + raise RuntimeError("Invalid ``p2p_op_list``. Each op is expected to " + "to be of type ``paddle.distributed.P2POp``.") + + backend = _group_map_backend[p2p_op_list[0].group] + if not all(backend == _group_map_backend[p2p_op.group] + for p2p_op in p2p_op_list): + raise RuntimeError("All groups need to use the same backend.") + + +def batch_isend_irecv(p2p_op_list): + """ + Send or Receive a batch of tensors asynchronously and return a list of requests. + + Process each of the point-to-point operations in ``p2p_op_list`` and return the + corresponding tasks. NCCL are currently supported. + + Args: + p2p_op_list: A list of point-to-point operations(type of each operator is + ``paddle.distributed.P2POp``). The order of the isend/irecv in the list + matters and it needs to match with corresponding isend/irecv on the + remote end. + + Returns: + A list of distributed tasks returned by calling the corresponding + op in the op_list. + + Warning: + This API only supports the dygraph mode. + + Examples: + .. 
code-block:: python + + # required: distributed + + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + send_t = paddle.arange(2) + rank + # paddle.tensor([0, 1]) # Rank-0 + # paddle.tensor([1, 2]) # Rank-1 + + recv_t = paddle.empty(shape=[2], dtype=send_t.dtype) + + send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size) + recv_op = dist.P2POp(dist.irecv, recv_t, (rank - 1 + world_size) % world_size) + + tasks = dist.batch_isend_irecv([send_op, recv_op]) + + for task in tasks: + task.wait() + + print(recv_t) + # paddle.tensor([1, 2]) # Rank-0 + # paddle.tensor([0, 1]) # Rank-1 + """ + _check_p2p_op_list(p2p_op_list) + group = p2p_op_list[0].group + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + group = _get_default_group() if group is None else group + backend = _group_map_backend[group] + tasks = [] + with _with_batch_p2p_guard(backend): + for p2p_op in p2p_op_list: + op = p2p_op.op + tensor = p2p_op.tensor + peer = p2p_op.peer + comm_group = p2p_op.group + task = op(tensor, peer, comm_group) + if task is not None: + tasks.append(task) + return tasks + else: + raise RuntimeError("Don't support static graph mode currently.") + + +def reduce_scatter(tensor, + tensor_list, + op=ReduceOp.SUM, + group=None, + use_calc_stream=True): + """ + Reduces, then scatters a list of tensors to all processes in a group + + Args: + tensor (Tensor): Output tensor. + tensor_list (list[Tensor]): List of tensors to reduce and scatter. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. + group (Group, optional): The group instance return by new_group or None for global + default group. Default: None. + use_calc_stream (bool, optional): Whether this op should be an async op. + + Returns: + Async task handle, if use_calc_stream is set to False. + None, if use_calc_stream or if not part of the group. + + Warning: + This API only supports the dygraph mode. + + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + t1 = paddle.to_tensor([0, 1]) + t2 = paddle.to_tensor([2, 3]) + else: + t1 = paddle.to_tensor([4, 5]) + t2 = paddle.to_tensor([6, 7]) + + tensor_list = [t1, t2] + + output = paddle.empty(shape=[2], dtype=tensor_list[0].dtype) + dist.reduce_scatter(output, tensor_list) + + print(output) + # [4, 6] # Rank-0 + # [8, 10] # Rank-1 + + """ + _check_single_tensor(tensor, "tensor") + _check_tensor_list(tensor_list, "tensor_list") + + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + op_type = _get_reduce_op(op, "reduce_scatter") + group = _get_default_group() if group is None else group + + temp = paddle.concat(tensor_list, axis=0) + task = group.process_group._reduce_scatter_base(tensor, temp, op_type) + if use_calc_stream: + task.wait() + return None + else: + return task + else: + raise RuntimeError("Don't support static graph mode currently.") + + +def _reduce_scatter_base(output, + input, + op=ReduceOp.SUM, + group=None, + use_calc_stream=True): + """ + Reduces, then scatters a flattened tensor to all processes in a group. + + Args: + output (Tensor): Output tensor. 
+ input (Tensor): Input tensor that is of size output tensor size times world size + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. + group (ProcessGroup, optional): The process group to work on. If None, + the default process group will be used. + use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream (False). + Default to True. + Returns: + Async task handle, if use_calc_stream is set to False. + None, if use_calc_stream or if not part of the group. + + Examples: + .. code-block:: python + + # required: distributed + + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + rank = dist.get_rank() + world_size = dist.get_world_size() + + input = paddle.arange(4) + rank + # [0, 1, 2, 3] # Rank-0 + # [1, 2, 3, 4] # Rank-1 + + output = paddle.empty(shape=[2], dtype=input.dtype) + paddle.distributed.collective._reduce_scatter_base(output, input) + print(output) + # [1, 3] # Rank-0 + # [5, 7] # Rank-1 + + """ + _check_single_tensor(output, "output") + _check_single_tensor(input, "input") + + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + op_type = _get_reduce_op(op, "_reduce_scatter_base") + group = _get_default_group() if group is None else group + task = group.process_group._reduce_scatter_base(output, input, op_type) + if use_calc_stream: + task.wait() + return None + else: + return task + else: + raise RuntimeError("Don't support static graph mode currently.") diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 52d19ae52b2ba..e95b771fe6f6a 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -42,6 +42,7 @@ from paddle.distributed.collective import _set_default_store from paddle.distributed.collective import _new_process_group_impl from paddle.distributed.collective import Group +from paddle.distributed.collective import _set_group_map_backend __all__ = [] @@ -257,6 +258,7 @@ def train(): name=_default_group_name) _set_group_map_by_name(_default_group_name, group) _set_group_map(0, group) + _set_group_map_backend(group, backend) parallel_helper._set_parallel_ctx(True) paddle.distributed.barrier(group=group) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 06bec07d7acaf..606f39c5e3b42 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -72,7 +72,10 @@ list(APPEND DIST_TEST_OPS test_auto_parallel_data_unshard) list(APPEND DIST_TEST_OPS test_auto_parallel_save_load) list(APPEND DIST_TEST_OPS test_auto_parallel_autoconvert) list(APPEND DIST_TEST_OPS test_collective_process_group) +list(APPEND DIST_TEST_OPS test_collective_alltoall_single) list(APPEND DIST_TEST_OPS test_eager_dist_api) +list(APPEND DIST_TEST_OPS test_collective_batch_isend_irecv) +list(APPEND DIST_TEST_OPS test_collective_reduce_scatter) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. 
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -334,7 +337,11 @@ if((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load) list(REMOVE_ITEM TEST_OPS test_auto_parallel_autoconvert) list(REMOVE_ITEM TEST_OPS test_collective_process_group) + list(REMOVE_ITEM TEST_OPS test_collective_alltoall_single) list(REMOVE_ITEM TEST_OPS test_eager_dist_api) + list(REMOVE_ITEM TEST_OPS test_collective_batch_isend_irecv) + list(REMOVE_ITEM TEST_OPS test_collective_reduce_scatter) + elseif(WITH_GPU) if(${CUDNN_VERSION} VERSION_LESS 7100) list(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -1569,8 +1576,10 @@ if(WITH_DISTRIBUTE set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_alltoall_single PROPERTIES TIMEOUT 60) set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 100) - + set_tests_properties(test_collective_batch_isend_irecv PROPERTIES TIMEOUT 100) + set_tests_properties(test_collective_reduce_scatter PROPERTIES TIMEOUT 100) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/tests/unittests/collective_alltoall_single.py b/python/paddle/fluid/tests/unittests/collective_alltoall_single.py new file mode 100644 index 0000000000000..cb6777d20bc25 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_alltoall_single.py @@ -0,0 +1,86 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +class TestCollectiveAllToAllSingle(unittest.TestCase): + + def setUp(self): + assert not paddle.distributed.is_initialized(), \ + "The distributed environment has not been initialized." + dist.init_parallel_env() + assert paddle.distributed.is_initialized(), \ + "The distributed environment has been initialized." 
+ + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + + def test_collective_alltoall_single(self): + rank = dist.get_rank() + size = dist.get_world_size() + + # case 1 + input = paddle.ones([size, size], dtype='int64') * rank + output = paddle.empty([size, size], dtype='int64') + expected_output = paddle.concat( + [paddle.ones([1, size], dtype='int64') * i for i in range(size)]) + + group = dist.new_group([0, 1]) + dist.alltoall_single(input, output, group=group) + + np.testing.assert_allclose(output.numpy(), expected_output.numpy()) + dist.destroy_process_group(group) + + # case 2 + in_split_sizes = [i + 1 for i in range(size)] + out_split_sizes = [rank + 1 for i in range(size)] + + input = paddle.ones([sum(in_split_sizes), size], dtype='float32') * rank + output = paddle.empty([(rank + 1) * size, size], dtype='float32') + expected_output = paddle.concat([ + paddle.ones([rank + 1, size], dtype='float32') * i + for i in range(size) + ]) + + group = dist.new_group([0, 1]) + task = dist.alltoall_single(input, + output, + in_split_sizes, + out_split_sizes, + use_calc_stream=False, + group=group) + task.wait() + + np.testing.assert_allclose(output.numpy(), expected_output.numpy()) + dist.destroy_process_group(group) + + def tearDown(self): + dist.destroy_process_group() + assert not paddle.distributed.is_initialized(), \ + "The distributed environment has been deinitialized." + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective_batch_isend_irecv.py b/python/paddle/fluid/tests/unittests/collective_batch_isend_irecv.py new file mode 100644 index 0000000000000..5aa309a2bbe5d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_batch_isend_irecv.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
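The test below posts a matched isend/irecv pair per rank around a two-rank ring. A small sketch of the neighbor arithmetic it relies on (illustrative helper only, not an API of the module):

    # Each rank sends to its right neighbor and receives from its left
    # neighbor; both indices wrap around the ring.
    def ring_neighbors(rank, world_size):
        send_to = (rank + 1) % world_size
        recv_from = (rank - 1 + world_size) % world_size
        return send_to, recv_from

    assert ring_neighbors(0, 2) == (1, 1)
    assert ring_neighbors(1, 2) == (0, 0)
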
+ +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +class TestCollectiveBatchIsendIrecv(unittest.TestCase): + + def setUp(self): + dist.init_parallel_env() + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + + def test_collective_batch_isend_irecv(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + send_t = paddle.arange(2) + rank + # paddle.tensor([0, 1]) # Rank-0 + # paddle.tensor([1, 2]) # Rank-1 + recv_t = paddle.empty(shape=[2], dtype=send_t.dtype) + send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size) + recv_op = dist.P2POp(dist.irecv, recv_t, + (rank - 1 + world_size) % world_size) + tasks = dist.batch_isend_irecv([send_op, recv_op]) + + for task in tasks: + task.wait() + + if rank == 0: + np.testing.assert_allclose(recv_t.numpy(), [1, 2]) + elif rank == 1: + np.testing.assert_allclose(recv_t.numpy(), [0, 1]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_scatter.py b/python/paddle/fluid/tests/unittests/collective_reduce_scatter.py new file mode 100644 index 0000000000000..0e36296e4089c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_reduce_scatter.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
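The reduce-scatter tests that follow assert concrete values. Assuming two ranks, the expected outputs can be derived by hand: the per-rank tensor lists are reduced elementwise across ranks, then chunk i of the reduced result is scattered to rank i. A short sketch of that arithmetic (NumPy only, illustrative):

    import numpy as np

    # Per-rank inputs, matching the SUM test below.
    rank0 = [np.array([0, 1]), np.array([2, 3])]
    rank1 = [np.array([4, 5]), np.array([6, 7])]

    # Elementwise SUM across ranks, then chunk i goes to rank i.
    reduced = [a + b for a, b in zip(rank0, rank1)]
    assert (reduced[0] == np.array([4, 6])).all()    # received by rank 0
    assert (reduced[1] == np.array([8, 10])).all()   # received by rank 1
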
+ +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +class TestCollectiveReduceScatter(unittest.TestCase): + + def setUp(self): + dist.init_parallel_env() + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + + def test_collective_reduce_scatter_sum(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + t1 = paddle.to_tensor([0, 1]) + t2 = paddle.to_tensor([2, 3]) + else: + t1 = paddle.to_tensor([4, 5]) + t2 = paddle.to_tensor([6, 7]) + + input_list = [t1, t2] + + output = paddle.empty(shape=[2], dtype=input_list[0].dtype) + dist.reduce_scatter(output, input_list) + + if rank == 0: + np.testing.assert_allclose(output.numpy(), [4, 6]) + elif rank == 1: + np.testing.assert_allclose(output.numpy(), [8, 10]) + + def test_collective_reduce_scatter_max(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + if rank == 0: + t1 = paddle.to_tensor([0, 1], dtype="float16") + t2 = paddle.to_tensor([2, 3], dtype="float16") + else: + t1 = paddle.to_tensor([4, 5], dtype="float16") + t2 = paddle.to_tensor([6, 7], dtype="float16") + + input_list = [t1, t2] + + output = paddle.empty(shape=[2], dtype=input_list[0].dtype) + dist.reduce_scatter(output, input_list, op=dist.ReduceOp.MAX) + + if rank == 0: + np.testing.assert_allclose(output.numpy(), [4, 5]) + elif rank == 1: + np.testing.assert_allclose(output.numpy(), [6, 7]) + + def test_collective_reduce_scatter_base(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + input = paddle.arange(4) + rank + # [0, 1, 2, 3] # Rank-0 + # [1, 2, 3, 4] # Rank-1 + + output = paddle.empty(shape=[2], dtype=input.dtype) + task = paddle.distributed.collective._reduce_scatter_base( + output, input, use_calc_stream=False) + + task.wait() + + if rank == 0: + np.testing.assert_allclose(output.numpy(), [1, 3]) + elif rank == 1: + np.testing.assert_allclose(output.numpy(), [5, 7]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_alltoall_single.py b/python/paddle/fluid/tests/unittests/test_collective_alltoall_single.py new file mode 100644 index 0000000000000..e848404850d9e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_alltoall_single.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestCollectiveAllToAllSingle(TestMultipleGpus): + + def test_collective_alltoall_single(self): + self.run_mnist_2gpu('collective_alltoall_single.py', eager_mode=True) + + +if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_batch_isend_irecv.py b/python/paddle/fluid/tests/unittests/test_collective_batch_isend_irecv.py new file mode 100644 index 0000000000000..a93c417b99c65 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_batch_isend_irecv.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestCollectiveBatchIsendIrecv(TestMultipleGpus): + + def test_collective_batch_isend_irecv(self): + self.run_mnist_2gpu('collective_batch_isend_irecv.py', eager_mode=True) + + +if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_reduce_scatter.py new file mode 100644 index 0000000000000..93d181243b1fa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_reduce_scatter.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestCollectiveReduceScatter(TestMultipleGpus): + + def test_collective_reduce_scatter(self): + self.run_mnist_2gpu('collective_reduce_scatter.py', eager_mode=True) + + +if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" + unittest.main() From dd63e5b46911cff800159176085a9c7b82b5430a Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Mon, 11 Jul 2022 20:22:49 +0800 Subject: [PATCH 127/250] reorganize the higher order autodiff api (#44119) * move _gradients to primapi and rename to grad * modify jvp to call forward_grad in primitive mode * add primapi unittest and remove some unused test cases. * fix circular import problem * move paddle/autograd/functional into paddle/incubate.autograd/functional * remove unused JacobianBatchLast class --- python/paddle/autograd/__init__.py | 2 - python/paddle/autograd/functional.py | 1362 ----------------- python/paddle/autograd/utils.py | 26 - python/paddle/fluid/backward.py | 6 - .../tests/unittests/autograd/CMakeLists.txt | 2 +- .../test_autograd_functional_dynamic.py | 899 +---------- .../autograd/test_autograd_functional_prim.py | 125 ++ .../test_autograd_functional_static.py | 14 +- ...ients_and_minimize.py => test_minimize.py} | 72 +- .../tests/unittests/autograd/test_primapi.py | 131 +- .../tests/unittests/autograd/test_primops.py | 2 +- .../fluid/tests/unittests/autograd/utils.py | 2 +- python/paddle/incubate/autograd/__init__.py | 9 +- python/paddle/incubate/autograd/functional.py | 675 ++++++++ python/paddle/incubate/autograd/primapi.py | 123 +- python/paddle/incubate/autograd/primops.py | 1 + python/paddle/incubate/autograd/primx.py | 46 +- python/paddle/incubate/autograd/utils.py | 10 + 18 files changed, 1099 insertions(+), 2408 deletions(-) delete mode 100644 python/paddle/autograd/functional.py delete mode 100644 python/paddle/autograd/utils.py rename python/paddle/fluid/tests/unittests/autograd/{test_gradients_and_minimize.py => test_minimize.py} (56%) create mode 100644 python/paddle/incubate/autograd/functional.py diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index 6669e4f4c70aa..8bc7b11368680 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -26,8 +26,6 @@ from .py_layer import LegacyPyLayerContext as PyLayerContext # noqa: F401 from ..framework import set_grad_enabled, is_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 -from .functional import vjp, jvp, Jacobian, Hessian # noqa: F401 -from .functional import jacobian, hessian, batch_jacobian, batch_hessian, vhp # noqa: F401 __all__ = [ # noqa 'backward', diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py deleted file mode 100644 index aa3e99978b72a..0000000000000 --- a/python/paddle/autograd/functional.py +++ /dev/null @@ -1,1362 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import functools -import typing - -import paddle -from paddle.fluid import framework -from paddle.autograd.utils import as_tensors - - -def vjp(func, xs, v=None): - r"""Computes the Vector-Jacobian product, a functional form of - reverse mode automatic differentiation. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func(Callable): A function that takes ``xs`` as inputs parameter and - returns a sequence of Tensors or a Tensor. - xs(Tensor|Sequence[Tensor]): Used as positional arguments to evaluate - ``func``. ``xs`` is accepted as one Tensor or a sequence of Tensors. - v(Tensor|Sequence[Tensor]|None, optional): The cotangent vector invovled - in the VJP computation. ``v`` matches the size and shape of - ``func`` 's output. Defaults to None, which is equivalent to all - ones the same size of ``func`` 's output. - - Returns: - output(tuple): - - - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . - - vjp(Tensor|tuple[Tensor]): The vjp result. - - Examples: - - .. code-block:: python - - import paddle - - def func(x): - return paddle.matmul(x, x) - - x = paddle.ones(shape=[2, 2], dtype='float32') - _, vjp_result = paddle.incubate.autograd.vjp(func, x) - print(vjp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[4., 4.], - # [4., 4.]]) - - v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) - _, vjp_result = paddle.incubate.autograd.vjp(func, x, v) - print(vjp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 1.], - # [1., 0.]]) - """ - _check_inputs(func, xs, v) - - # ``_seprate`` breaks the dependencies between ``xs`` and other - # variables. See more ``_seprate`` . - xs, v = _separate(xs), _separate(v) - ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) - _check_v_shape(v, ys) - - return ys, _grad(ys, xs, v) - - -def jvp(func, xs, v=None): - r""" - Computes the Jacobian-Vector product for a function at the given - inputs and a vector in the tangent space induced by the inputs. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func(Callable): The ``func`` takes as input a Tensor or a Sequence - of Tensors and returns a Tensor or a Sequence of Tensors. - xs(Tensor|Sequence[Tensor]): Used as positional arguments to - evaluate ``func``. The ``xs`` is accepted as one Tensor or a - Sequence of Tensors. - v(Tensor|Sequence[Tensor]|None, Optional): The tangent vector invovled - in the JVP computation. The ``v`` matches the size and shape of - ``xs`` . Default value is None and in this case is equivalent to - all ones the same size of ``xs`` . - - Returns: - output(tuple): - - - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . - - jvp(Tensor|tuple[Tensor]): The jvp result. - - Examples: - - .. 
code-block:: python - - import paddle - - - def func(x): - return paddle.matmul(x, x) - - - x = paddle.ones(shape=[2, 2], dtype='float32') - _, jvp_result = paddle.incubate.autograd.jvp(func, x) - print(jvp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[4., 4.], - # [4., 4.]]) - v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) - _, jvp_result = paddle.incubate.autograd.jvp(func, x, v) - print(jvp_result) - # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 1.], - # [1., 0.]]) - - """ - _check_inputs(func, xs, v) - # ``_seprate`` breaks the dependencies between ``xs`` and other - # variables. See more ``_seprate`` . - xs, v = _separate(xs), _separate(v) - ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) - _check_v_shape(v, xs) - return ys, _double_backward_trick(ys, xs, v) - - -def _double_backward_trick(ys, xs, v): - """Double backward trick for computing ``jvp`` by ``vjp`` - see details: https://j-towns.github.io/2017/06/12/A-new-trick.html - """ - # The value of ys_grad is not important, it can be any random value in - # theory, but it's required to set stop_gradient=False. - ys_grad = _zeros_like_with_grad(ys) - xs_grad = _grad(ys, xs, ys_grad) - return _grad(xs_grad, ys_grad, v) - - -def _zeros_like_with_grad(xs): - """Create a zero or zeros sequence Tensor like ``xs`` with a flag - ``stop_graident=False`` . - """ - if not isinstance(xs, typing.Sequence): - ys = paddle.zeros_like(xs) - ys.stop_gradient = False - else: - ys = [] - for x in xs: - y = paddle.zeros_like(x) - y.stop_gradient = False - ys.append(y) - return ys - - -class Jacobian(object): - r""" - Computes the Jacobian matrix of a given function. - - If the function has multiple inputs and multiple outputs, during internal - implementation, all input tensors are concatenated after being flatten, - the batch dimension is retained, and the output is subject to the same - processing rules. - - Once the Jacobian ``J`` is constructed, you can use a multidimensional index - to retrieve the submatrix of ``J``, as same as slicing a Tensor. The - submatrix is lazily evaluated along row axis, and will be cached once - evaluated. - - For examples, supposing ``is_batched=True``, you can retrieve the submatrix - by following methods: - - * J[:], retrieving the full matrix. - * J[:, :, j], retrieving the partial derivatives w.r.t. the j'th input - variable. - * J[:, i, :], retrieving the partial derivatives w.r.t. the i'th output - variable. - * J[:, i, j], retrieving the partial derivatives w.r.t. the i'th output - variable and the j'th input variable. - - Notes: - - Eclipsis index is not supported currently. - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - - func (Callable): A python function that takes a Tensor or a sequence of - Tensors as inputs(the first dimension is batch size) and - returns a Tensor a sequence of Tensors. - xs (Tensor|Sequence[Tensor]): The input to the function ``func`` . - is_batched (bool): If true, the first axis is batch axis. Defaults to - False. - - Returns: - - Jacobian (Object): A python object retains the Jacobian matrix. - - Examples: - - .. 
code-block:: python - - import paddle - - - def func(x, y): - return paddle.matmul(x, y) - - - x = paddle.to_tensor([[1., 2.], [3., 4.]]) - J = paddle.incubate.autograd.Jacobian(func, [x, x]) - print(J[:, :]) - # Tensor(shape=[4, 8], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[1., 3., 0., 0., 1., 0., 2., 0.], - # [2., 4., 0., 0., 0., 1., 0., 2.], - # [0., 0., 1., 3., 3., 0., 4., 0.], - # [0., 0., 2., 4., 0., 3., 0., 4.]]) - - print(J[0, :]) - # Tensor(shape=[8], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [1., 3., 0., 0., 1., 0., 2., 0.]) - print(J[:, 0]) - # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [1., 2., 0., 0.]) - - """ - - def __init__(self, func, xs, is_batched=False): - if not is_batched: - self._jacobian = _JacobianNoBatch(func, xs) - else: - self._jacobian = _JacobianBatchFirst(func, xs) - - def __getitem__(self, indexes): - return self._jacobian[indexes] - - @property - def shape(self): - """The shape of flattened Jacobian matrix. - """ - return self._jacobian.shape - - -class Hessian(object): - """ - Computes the Hessian matrix with a given ``func`` with respect to ``xs`` . - - If the function has multiple inputs, during internal implementation, - all input tensors are concatenated after being flatten, the batch dimension - is retained. - - The Hessian submatrix is lazily evaluated, and can be retrieved with a - multidimensional indexes. See details ``Jacobian`` . - - Warning: - This API is in beta, the signatures could be changed in future version. - - Args: - func (Callable): A python function that takes a Tensor or a Tensor - sequence as inputs and returns a Tensor with shape - ``[batch_size, 1]`` with batch or ``[1]`` without batch. - xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of - the function ``func``. - is_batched (bool): If true, the first axis is batch axis. Defaults to - False. - - Returns: - - Hessian (Object): A python object retains the Hessian matrix. - - - Examples: - - .. code-block:: python - - import paddle - - - def reducer(x): - return paddle.sum(x * x) - - - x = paddle.rand([2, 2]) - h = paddle.incubate.autograd.Hessian(reducer, x) - print(h[:]) - # Tensor(shape=[4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=False, - # [[2., 0., 0., 0.], - # [0., 2., 0., 0.], - # [0., 0., 2., 0.], - # [0., 0., 0., 2.]]) - """ - - def __init__(self, func, xs, is_batched=False): - - def _jac_func(*xs): - jac = Jacobian(func, xs, is_batched=is_batched) - if (is_batched and jac.shape[1] != 1) or (not is_batched - and jac.shape[0] != 1): - raise RuntimeError( - "The function given to Hessian shoud return as single element Tensor or batched single element Tensor." - ) - return jac[:, 0, :] if is_batched else jac[0, :] - - self.symbolic = Jacobian(_jac_func, xs, is_batched=is_batched) - - def __getitem__(self, indexes): - return self.symbolic[indexes] - - @property - def shape(self): - """The shape of flattened Hessian matrix. - """ - return self.symbolic.shape - - -class _Jacobian(object): - """The base class for computing Jacobian matrix. - - ``_Jacobian`` implementes the core logic of multidimensional index and lazy - evaluation for Jacobian matrix, subclass only need to overwrite following - methods: - - * ``_lazy_axis()``, return the axis along which will be lazy - evaluating. - * ``_flatten(xs)``, flattens the inputs ``xs``. - * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . 
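(This file is being moved under paddle.incubate.autograd rather than redesigned, so the contract above still applies.) The three hooks listed above reduce to a per-row cache along the lazy axis: a Jacobian row is computed only when first indexed and reused afterwards. A stripped-down sketch of that pattern, with illustrative names only:

    # Row-wise lazy evaluation with caching, mirroring _cached_evaluate():
    # compute a row of the Jacobian the first time it is indexed, then reuse it.
    class LazyRows:
        def __init__(self, evaluate_row):
            self._evaluate_row = evaluate_row   # callable computing one row along the lazy axis
            self._cache = {}

        def row(self, i):
            if i not in self._cache:
                self._cache[i] = self._evaluate_row(i)
            return self._cache[i]
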
- - Notes: - - Because currently PaddlePaddle only support reverse differentiation by - ``paddle.grad``, so lazy evaluation is only supported along the row of - Jacobian matrix, which means that slicing along row will get better - performance. - - """ - - def __init__(self, func, xs): - # Skip separating in prim mode temporarily, as detach and clone are not - # primitive operators. - if not paddle.fluid._non_static_mode( - ) and paddle.incubate.autograd.prim_enabled(): - self._xs = xs - else: - self._xs = _separate(xs) - self._ys = func(*as_tensors(self._xs)) - self._flatten_xs = self._flatten(as_tensors(self._xs)) - self._flatten_ys = self._flatten(as_tensors(self._ys)) - self._cache = {} - - @property - def shape(self): - raise NotImplementedError - - @property - def _lazy_axis(self): - """"The axis of lazily evaluated.""" - raise NotImplementedError - - def _lazy_indexes(self, indexes): - idx = indexes[self._lazy_axis] - return (idx, ) if isinstance(idx, int) else tuple( - range(idx.start, idx.stop, idx.step)) - - def _flatten(self, xs): - raise NotImplementedError - - def _shifted_indexes(self, indexes, lazy_axis_size=0): - idx = indexes[self._lazy_axis] - shifted_lazy_axis_idx = 0 if isinstance(idx, int) else slice( - 0, lazy_axis_size, 1) - return indexes[:self._lazy_axis] + ( - shifted_lazy_axis_idx, ) + indexes[self._lazy_axis + 1:] - - def __getitem__(self, indexes): - indexes = _multi_index(indexes, self.shape) - - if isinstance(indexes[self._lazy_axis], int): - other_indexes = indexes[:self._lazy_axis] + \ - indexes[self._lazy_axis+1:] - return self._cached_evaluate( - indexes[self._lazy_axis])[other_indexes] - lazy_indexes = self._lazy_indexes(indexes) - # Using concat and reshape to replace stack operator temporarily, as - # it is not a primitive operator. - shape = list(self.shape) - shape[self._lazy_axis] = len(lazy_indexes) - part_jac = paddle.concat( - [self._cached_evaluate(i) for i in lazy_indexes], - axis=self._lazy_axis).reshape(shape) - return part_jac[self._shifted_indexes(indexes, len(lazy_indexes))] - - def _cached_evaluate(self, k): - v = self._cache.get(k) - if v is None: - v = self._evaluate(k) - self._cache[k] = v - return v - - def _evaluate(self, index): - """Evaluate one slice at along lazy axis.""" - raise NotImplementedError - - -class _JacobianNoBatch(_Jacobian): - """Compute Jacobian matrix without batch dimension. - Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is - ``(N, M)`` . - """ - - def __init__(self, func, xs): - super(_JacobianNoBatch, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_ys.shape[0], self._flatten_xs.shape[0]) - - @property - def _lazy_axis(self): - return 0 - - def _flatten(self, xs): - return paddle.concat(tuple(x.reshape((-1, )) for x in xs)) - - def _evaluate(self, row_index): - return self._flatten(_grad( - self._flatten_ys[row_index], - self._xs, - )) - - -class _JacobianBatchLast(_Jacobian): - """Compute Jacobian matrix with batch at last axis. - Suppose the mapping is :math:`f: R^{M,B} \to R^{N,B}`, the output shape is - ``(N, M, B)`` . 
- """ - - def __init__(self, func, xs): - super(_JacobianBatchLast, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_ys.shape[0], self._flatten_xs.shape[0], - self._flatten_xs.shape[1]) - - @property - def _lazy_axis(self): - return 0 - - def _flatten(self, xs): - return paddle.concat( - tuple(x.reshape((-1, x.shape[-1])) for x in as_tensors(xs)), 0) - - def _evaluate(self, row): - return self._flatten(_grad(self._flatten_ys[row, :], self._xs)) - - -class _JacobianBatchFirst(_Jacobian): - """Compute Jacobian matrix with batch at first axis. - Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is - ``(B, N, M)`` . - """ - - def __init__(self, func, xs): - super(_JacobianBatchFirst, self).__init__(func, xs) - - @property - def shape(self): - return (self._flatten_xs.shape[0], self._flatten_ys.shape[1], - self._flatten_xs.shape[1]) - - @property - def _lazy_axis(self): - return 1 - - def _flatten(self, xs): - return paddle.concat( - tuple(x.reshape((x.shape[0], -1)) for x in as_tensors(xs)), 1) - - def _evaluate(self, row_index): - return self._flatten(_grad(self._flatten_ys[:, row_index], self._xs)) - - -def _multi_index(indexes, shape): - """A tool for parsing N-dimensional index into a standard format. - - Currently supporting following input format: - * ([positive|negative|slice], ...), the right-most elements can be - omited. - - The standard format after converted is slice tuple which contains N elements: - * ([positive|slice], ..., [positive|slice]) - - Notes: - Ellipsis indexes such as ``(..., i), (i, ...)`` is not supported. - - Args: - indexes (tuple): The input indexes. - shape (tuple): The input shape. - - Returns: - tuple: The standard format index as the above description. - """ - indexes = indexes if isinstance(indexes, typing.Sequence) else (indexes, ) - if any(isinstance(i, type(Ellipsis)) for i in indexes): - raise IndexError('Ellipsis index currently is not supported.') - # Fill the right-most elements. - indexes = indexes + (slice(0, None, None), ) * (len(shape) - len(indexes)) - # Convert to positive index. - positive_indexes = [] - for i, index in enumerate(indexes): - if isinstance(index, slice): - index = slice(index.start or 0, index.stop or shape[i], index.step - or 1) - positive_indexes.append( - slice( - index.start + shape[i] if index.start < 0 else index.start, - index.stop + shape[i] if index.stop < 0 else index.stop, - # Negative step means index backward, no need to convert to - # positive interger. - index.step)) - elif isinstance(index, int): - positive_indexes.append(index + shape[i] if index < 0 else index) - else: - raise TypeError(f'Not supported index type {index}.') - return tuple(positive_indexes) - - -def _stack_tensor_or_return_none(origin_list): - assert len(origin_list) > 0, "Can't not stack an empty list" - return paddle.stack(origin_list, axis=0) if isinstance( - origin_list[0], paddle.fluid.framework.Variable) else None - - -def _replace_none_with_zero_tensor(xs, refs): - if xs is None: - xs = paddle.zeros_like(refs) - xs.stop_gradient = refs.stop_gradient - return xs - elif isinstance(xs, typing.Sequence): - return tuple( - _replace_none_with_zero_tensor(x, refs[i]) - for i, x in enumerate(xs)) - else: - return xs - - -def _grad(ys, xs, v=None): - """A gradient function that can be used in dynamic graph and static graph. 
- - The ``grad`` combines ``paddle.grad`` used in dynamic graph and - ``paddle.static.gradients`` used in static graph, and do following changes: - - * The ``allow_unused`` flag is removed and set defaults to true internally, - none in outputs will be replaced by zero tensor. - * The ``create_graph`` flag is removed and set defaults to true internally, - only makes sense in dynamic graph. - * When xs is a single Tensor, ``paddle.grad`` returns a list which only - contains one Tensor. It may confuse users, thus in this case we improve - to return a single Tensor in _grad interface. - - Args: - ys (Tensor|Sequence[Tensor]): The output tensor or tensor sequence of - the graph to compute gradients. - xs (Tensor|Sequence[Tensor]): The input tensor or tensor sequence of the graph to - compute gradients. The returned values of this API are the - gradients of inputs . - v (Tensor|Sequence[Tensor]|None,optional): The initial gradient values - of outputs . If grad_outputs is None, the initial gradient values of - outputs would be Tensors filled with 1; if grad_outputs is not None, - it must have the same length as outputs , and in this case, the - initial gradient value of the i-th outputs would be: (1) a Tensor - filled with 1 when the i-th element of grad_outputs is None; - (2) the i-th element of grad_outputs when the i-th element of - grad_outputs is a Tensor. Default None. - - Returns: - Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the - same as the Tensor number inside inputs, and the i-th returned - Tensor is the sum of gradients of outputs with respect to the i-th - inputs. - """ - if paddle.fluid._non_static_mode(): - xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True) - else: - xs_grad = paddle.static.gradients(ys, xs, v) - - if isinstance(xs, paddle.fluid.framework.Variable): - xs_grad = xs_grad[0] - - return _replace_none_with_zero_tensor(xs_grad, xs) - - -def _separate(xs): - """ - ``_separate`` separates ``xs`` from the computation graph through ``clone`` - or ``deteach`` . - - Interally, ``paddle.grad(xs, ys)`` is stateful API implemented based on - computional graph, which will reduce gradients along all path from ys to xs. - - However, funcional autograd API such as ``vjp``, ``jvp`` is stateless, and - only compute gradients with a given ``func`` . - - For example, given a ``func`` :math:`y0=f(x0)`, supposing forward path is: - ``x0 -> y0``, ``x0 -> x1 -> y0`` . - ``paddle.grad(y0, x0)`` will reduce gradients along ``y0->x0`` and - ``y0->x1->x0``, and ``vjp`` only need reduce along ``y0->x0``. - - So, it's needed to clone or detach xs for breaking the dependencies with - other variables. - - Examples: - - .. code-block:: python - - import paddle - from paddle.autograd.functional import _separate - - - def func(x, y): - return x * y - - - x = paddle.ones((1,)) - x.stop_gradient = False - - y = func(x, x) - print(paddle.grad(y, x)) - # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [2.])] - - x1, x2 = _separate((x, x)) - y = func(x1, x2) - print(paddle.grad(y, x1)) - # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [1.])] - - """ - if isinstance(xs, typing.Sequence): - return tuple(_single_separate(x) for x in xs) - else: - return _single_separate(xs) - - -def _single_separate(x): - if x is None: # x maybe none because grad input's v defaults to none. - return x - if not x.stop_gradient: - return paddle.clone(x) - else: # use detach to share memory when no need gradients. 
- x = x.detach() - x.stop_gradient = False - return x - return x - - -def _check_inputs(func, xs, v=None): - if not callable(func): - raise TypeError(f"Expected 'fun' is Callable, but got {type(func)}.") - - if not isinstance(xs, (framework.Variable, typing.Sequence)): - raise TypeError(f"Expected 'xs' is a Tensor|Sequence[Tensor]," - f"but got {type(xs)}.") - if isinstance(xs, typing.Sequence) and not all( - isinstance(x, framework.Variable) for x in xs): - raise TypeError("All elements of 'xs' shoule be Tensor.") - - if not isinstance(v, (framework.Variable, typing.Sequence, type(None))): - raise TypeError( - f"Expected 'v' is Tensor|Sequence[Tensor]|None, but got {type(v)}.") - - if isinstance(v, typing.Sequence) and not all( - isinstance(e, framework.Variable) for e in v): - raise TypeError("All elements of 'xs' shoule be Tensor.") - - -def _check_v_shape(v, refs): - if v is None: - return - - v, refs = as_tensors(v), as_tensors(refs) - if len(refs) != len(v): - raise RuntimeError(f"The argument v is a tuple of invalid length:" - f"should be {len(refs)} but got {len(v)}.") - - for index, (element_v, element_ref) in enumerate(zip(v, refs)): - if element_v.shape != element_ref.shape: - raise RuntimeError( - f"The v[{index}] has invalid shape: should " - f"be {element_ref.shape} but got {element_v.shape}.") - - -@framework.dygraph_only -def jacobian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the Jacobian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor or a Tensor tuple. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Jacobian (Tensor or nested tuple of Tensors): if function ``func`` - takes a Tensor as inputs and returns a Tensor as outputs, Jacobian - will be a single Tensor containing the Jacobian matrix for the - linearized inputs and outputs. If one of the inputs and outputs is - a Tensor, and another is a Tensor list/tuple, then the Jacobian will - be a tuple of Tensors. If both of inputs and outputs are Tensor - list/tuple, then the Jacobian will be a tuple of tuple of Tensors - where ``Jacobian[i][j]`` will contain the Jacobian matrix of the - linearized ``i``th output and ``j``th input and will have same - dtype and device as the corresponding input. ``Jacobian[i][j]`` will - have as size ``m * n``, where ``m`` and ``n`` denote the numbers of - elements of ``i``th output and ``j``th input respectively. - - - Examples 1: - .. 
code-block:: python - - import paddle - - def func(x): - return paddle.matmul(x, x) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, x) - print(jacobian) - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 2., 0., 1.], - # [1., 0., 2., 1.], - # [0., 1., 1., 2.]]) - - Examples 2: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.matmul(x, y) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') * 2 - x.stop_gradient = False - y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [x, y], create_graph=True) - print(jacobian) - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[2., 2., 0., 0.], - # [2., 2., 0., 0.], - # [0., 0., 2., 2.], - # [0., 0., 2., 2.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [[1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.]])) - - Examples 3: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.matmul(x, y), x * x - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') * 2 - x.stop_gradient = False - y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [x, y], allow_unused=True) - print(jacobian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 2., 0., 0.], - # [2., 2., 0., 0.], - # [0., 0., 2., 2.], - # [0., 0., 2., 2.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.]])), - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 0., 0.], - # [0., 2., 0., 0.], - # [0., 0., 2., 0.], - # [0., 0., 0., 2.]]), None)) - - ''' - inputs = as_tensors(inputs) - outputs = as_tensors(func(*inputs)) - fin_size = len(inputs) - fout_size = len(outputs) - flat_outputs = tuple( - paddle.reshape(output, shape=[-1]) for output in outputs) - jacobian = tuple() - for i, flat_output in enumerate(flat_outputs): - jac_i = list([] for _ in range(fin_size)) - for k in range(len(flat_output)): - row_k = paddle.grad(flat_output[k], - inputs, - create_graph=create_graph, - retain_graph=True, - allow_unused=allow_unused) - for j in range(fin_size): - jac_i[j].append( - paddle.reshape(row_k[j], shape=[-1]) if isinstance( - row_k[j], paddle.Tensor) else None) - jacobian += (tuple( - _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) - if fin_size == 1 and fout_size == 1: - return jacobian[0][0] - elif fin_size == 1 and fout_size != 1: - return tuple(jacobian[i][0] for i in range(fout_size)) - elif fin_size != 1 and fout_size == 1: - return jacobian[0] - else: - return jacobian - - -@framework.dygraph_only -def batch_jacobian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the batch Jacobian matrix of `func` with respect to `inputs`. - Noted that the first dimension of inputs is batch size. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs(the first dimension is batch size) and - returns a Tensor or a Tensor tuple. 
- inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``, Noted that - the first dimension of inputs is batch size. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Jacobian (Tensor or nested tuple of Tensors): if function ``func`` - takes a Tensor as inputs and returns a Tensor as outputs, Jacobian - will be a single Tensor containing the Jacobian matrix for the - linearized inputs and outputs. If one of the inputs and outputs is - a Tensor, and another is a Tensor list/tuple, then the Jacobian will - be a tuple of Tensors. If both of inputs and outputs are Tensor - list/tuple, then the Jacobian will be a tuple of tuple of Tensors. - Noted that the first dimension of inputs is batch size. - - For example, - the inputs shape and outputs shape of function ``func` is [batch_size, num] - and [batch_size, num] respectively, then the Jacobian will be a Tensor with - a shape of [num, batch_size * num], where ``Jacobian[i][j]`` will contain - the Jacobian matrix of the ``i``th column output and the ``j``th input and - will have same dtype and device as the corresponding input. - Other situations can be deduced by analogy. - - Examples 1: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(paddle.matmul(x, weight), y) - - x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, x) - print(batch_jacobian) - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4., 4., 4., 4., 4., 4., 4.], - # [4., 4., 4., 4., 4., 4., 4., 4.]]) - - Examples 2: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(paddle.matmul(x, weight), y), x * x - - x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, x) - print(batch_jacobian) - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4., 4., 4., 4., 4., 4., 4.], - # [4., 4., 4., 4., 4., 4., 4., 4.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]])) - - Examples 3: - .. 
code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return x * y - - x.stop_gradient = False - y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [x, y]) - print(batch_jacobian) - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0., 1., 0., 1., 0.], - # [0., 1., 0., 1., 0., 1., 0., 1.]]), Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0., 1., 0., 1., 0.], - # [0., 1., 0., 1., 0., 1., 0., 1.]])) - - ''' - - inputs = as_tensors(inputs) - outputs = as_tensors(func(*inputs)) - - batch_size = inputs[0].shape[0] - for input in inputs: - assert input.shape[ - 0] == batch_size, "The first dimension of input should equals to the same batch size!" - for output in outputs: - assert output.shape[ - 0] == batch_size, "The first dimension of output should equals to the same batch size!" - fin_size = len(inputs) - fout_size = len(outputs) - flat_outputs = tuple( - paddle.reshape(output, shape=[batch_size, -1]) for output in outputs) - jacobian = tuple() - for i, flat_output in enumerate(flat_outputs): - jac_i = list([] for _ in range(fin_size)) - for k in range(flat_output.shape[1]): - - row_k = paddle.grad(flat_output[:, k], - inputs, - create_graph=create_graph, - retain_graph=True, - allow_unused=allow_unused) - - for j in range(fin_size): - jac_i[j].append( - paddle.reshape(row_k[j], shape=[-1]) if isinstance( - row_k[j], paddle.Tensor) else None) - jacobian += (tuple( - _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) - if fin_size == 1 and fout_size == 1: - return jacobian[0][0] - elif fin_size == 1 and fout_size != 1: - return tuple(jacobian[i][0] for i in range(fout_size)) - elif fin_size != 1 and fout_size == 1: - return jacobian[0] - else: - return jacobian - - -@framework.dygraph_only -def batch_hessian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the batch Hessian matrix of `func` with respect to `inputs`. - Noted that the first dimension of inputs is batch size. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs(the first dimension is batch size) and - returns a Tensor with shape [batch_size, 1]. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - Noted that the first dimension of inputs is batch size. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` - takes a Tensor as ``inputs``, Hessian will be a single Tensor containing - the Hessian matrix for the linearized ``inputs`` Tensor. If function - ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will - be a tuple of tuple of Tensors. 
Noted that the first dimension of inputs - is batch size and the execution step is to obtain the result of the - first order differentiation, and then differentiate the batch input. - - For example, - the inputs shape and outputs shape of function ``func` is [batch_size, num] - and [batch_size, 1] respectively, then the batched Hessian will be a Tensor with - a shape of [num, batch_size * num]. - - Why the final shape in this case is that? - because batch_hessian will create a inner func(the wrapper of paddle.grad() func) - to computes the sum of gradients of `outputs` with respect to each `inputs`, - this inner func will get the first order differentiation and shape is [batch_size, num], - then call batch_jacobian to compute jacobian between the first order differentiation - and the origin inputs. The final result ``Hessian[i][j]`` will contain the Jacobian - matrix of the ``i``th column output(Noted that this output means the first order - differentiation) and the ``j``th input and will have same dtype and device as the - corresponding input. Other situations can be deduced by analogy. - - - Examples 1: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x): - return paddle.matmul(x * x, weight)[:, 0:1] - - - x.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, x) - print(batch_hessian) - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]) - - Examples 2: - .. code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return paddle.matmul(x * x * y * y, weight)[:, 0:1] - - x.stop_gradient = False - y.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, [x, y]) - print(batch_hessian) - # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]), - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 0., 4., 0., 4., 0., 4., 0.], - # [0., 4., 0., 4., 0., 4., 0., 4.]])), - # (Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[4., 0., 4., 0., 4., 0., 4., 0.], - # [0., 4., 0., 4., 0., 4., 0., 4.]]), - # Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]))) - - - Examples 3: - .. 
code-block:: python - - import paddle - - x = paddle.ones(shape=(4, 2), dtype='float64') - weight = paddle.ones(shape=(2, 4), dtype='float64') - y = paddle.ones(shape=(4, 2), dtype='float64') - - def func(x, y): - return paddle.matmul(x * x, weight)[:, 0:1] - - x.stop_gradient = False - y.stop_gradient = False - batch_hessian = paddle.autograd.batch_hessian(func, [x, y], allow_unused=True) - print(batch_hessian) - # ((Tensor(shape=[2, 8], dtype=float64, place=CUDAPlace(0), stop_gradient=True, - # [[2., 0., 2., 0., 2., 0., 2., 0.], - # [0., 2., 0., 2., 0., 2., 0., 2.]]), None), (None, None)) - - ''' - inputs = as_tensors(inputs) - outputs = func(*inputs) - batch_size = inputs[0].shape[0] - for input in inputs: - assert input.shape[ - 0] == batch_size, "The first dimension of input should equals to the same batch size!" - assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ - batch_size, 1 - ], "The function to compute batched Hessian matrix should return a Tensor of shape [batch_size, 1]" - - def jac_func(*ins): - grad_inputs = paddle.grad(outputs, - ins, - create_graph=True, - retain_graph=True, - allow_unused=allow_unused) - return tuple( - _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) - for i in range(len(inputs))) - - return batch_jacobian(jac_func, - inputs, - create_graph=create_graph, - allow_unused=allow_unused) - - -@framework.dygraph_only -def hessian(func, inputs, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the Hessian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor with a single element. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. - Returns: - Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` - takes a Tensor as ``inputs``, Hessian will be a single Tensor containing - the Hessian matrix for the linearized ``inputs`` Tensor. If function - ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will - be a tuple of tuple of Tensors where ``Hessian[i][j]`` will contain the - Hessian matrix of the ``i``th input and ``j``th input with size ``m * n``. - Here ``m`` and ``n`` denote the number of elements of the ``i`` th input - and the ``j`` th input respectively. - - Examples 1: - .. code-block:: python - - import paddle - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - hessian = paddle.autograd.hessian(func, x) - print(hessian) - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 0., 2., 1.], - # [1., 2., 0., 1.], - # [0., 1., 1., 2.]]) - - Examples 2: - .. 
code-block:: python - - import paddle - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [x, y]) - print(hessian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 1., 0., 0.], - # [0., 0., 1., 1.], - # [1., 1., 0., 0.], - # [0., 0., 1., 1.]])), - # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[1., 0., 1., 0.], - # [1., 0., 1., 0.], - # [0., 1., 0., 1.], - # [0., 1., 0., 1.]]), - # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]))) - - Examples 3: - .. code-block:: python - - import paddle - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - y = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [x, y], allow_unused=True) - print(hessian) - # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[2., 1., 1., 0.], - # [1., 0., 2., 1.], - # [1., 2., 0., 1.], - # [0., 1., 1., 2.]]), None), (None, None)) - - ''' - inputs = as_tensors(inputs) - outputs = func(*inputs) - assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ - 1 - ], "The function to compute Hessian matrix should return a Tensor with a single element" - - def jac_func(*ins): - grad_inputs = paddle.grad(outputs, - ins, - create_graph=True, - retain_graph=True, - allow_unused=allow_unused) - return tuple( - _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) - for i in range(len(inputs))) - - return jacobian(jac_func, - inputs, - create_graph=create_graph, - allow_unused=allow_unused) - - -def vhp(func, inputs, v=None, create_graph=False, allow_unused=False): - ''' - .. note:: - **This API is ONLY available in the imperative mode.** - - This function computes the product between a vector ``v`` and the - Hessian matrix of `func` with respect to `inputs`. - - Parameters: - func (function): a Python function that takes a Tensor or a Tensor - list/tuple as inputs and returns a Tensor with a single element. - inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or - Tensor list/tuple of the function ``func``. - v (Tensor|list(Tensor)|tuple(Tensor)|None, optional): the vector used - to compute vector hessian product. ``v`` should have same shape - and dtype with ``inputs``. If ``v`` is None, it will be set as - Tensor|list(Tensor) with all elements 1. Defaults to "None". - create_graph (bool, optional): whether to create the gradient graphs - of the computing process. When it is True, higher order derivatives - are supported to compute; when it is False, the gradient graphs of - the computing process would be discarded. Defaults to ``False``. - allow_unused (bool, optional): whether to raise error or return None if - some Tensors of `inputs` are unreachable in the graph. Error would - be raised if allow_unused=False, and None would be returned as - their gradients if allow_unused=True. Default False. 
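Similarly, the removed ``hessian`` and ``batch_hessian`` helpers correspond to the ``Hessian`` class exercised by ``TestHessianNoBatch`` and ``TestHessianBatchFirst`` later in this patch. A minimal sketch, assuming ``paddle.incubate.autograd.Hessian`` with ``is_batched`` defaulting to False, as those tests suggest:

.. code-block:: python

    import paddle

    def func(x):
        return paddle.sum(paddle.matmul(x, x))

    x = paddle.ones(shape=[2, 2], dtype='float32')
    x.stop_gradient = False
    H = paddle.incubate.autograd.Hessian(func, x)
    # H[:] materializes the dense 4 x 4 Hessian, the same matrix shown in the
    # removed ``hessian`` Example 1 above; pass is_batched=True for the
    # batch-first variant that replaces ``batch_hessian``.
    print(H[:])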
- Returns: - output (tuple): tuple with: - func_output (Tensor): output of ``func(inputs)`` - vhp (list(Tensor)): result of the vector hessian product - with the same shape and dtype as the inputs. - Examples 1: - .. code-block:: python - import paddle - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 - vhp_rslt = paddle.autograd.vhp(func, x, v=vx) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[8., 8.], - # [8., 8.]])) - - Examples 2: - .. code-block:: python - import paddle - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - vhp_rslt = paddle.autograd.vhp(func, x) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[4., 4.], - # [4., 4.]])) - - Examples 3: - .. code-block:: python - import paddle - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - x = paddle.ones(shape=[2, 2], dtype='float32') - x.stop_gradient = False - y = paddle.ones(shape=[2, 2], dtype='float32') - y.stop_gradient = False - vx = paddle.ones(shape=[2, 2], dtype='float32') * 2 - vy = paddle.ones(shape=[2, 2], dtype='float32') * 3 - vhp_rslt = paddle.autograd.vhp(func, [x, y], v=[vx, vy], allow_unused=True) - print(vhp_rslt) - # (Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False, - # [8.]), - # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[8., 8.], - # [8., 8.]]), None]) - ''' - xs = as_tensors(inputs) - if v is not None: - v = as_tensors(v) - xs, v = _separate(xs), _separate(v) - outputs = func(*xs) - ys = as_tensors(outputs) - assert len(ys) == 1 and isinstance( - ys[0], framework.Variable - ) and ys[0].shape == [ - 1 - ], "The function to compute vhp should return a Tensor with a single element" - jac = _grad(ys, xs) - vhp = _grad(jac, xs, v) - return outputs, vhp diff --git a/python/paddle/autograd/utils.py b/python/paddle/autograd/utils.py deleted file mode 100644 index 6b8865f4d7df0..0000000000000 --- a/python/paddle/autograd/utils.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
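The vector-Hessian product computed by the removed ``vhp`` helper can also be written out with two calls to ``paddle.grad`` (a gradient of the gradient), without relying on any functional wrapper. A minimal sketch that reproduces the removed Example 1 above, using the same inputs and expected values:

.. code-block:: python

    import paddle

    def func(x):
        return paddle.sum(paddle.matmul(x, x))

    x = paddle.ones(shape=[2, 2], dtype='float32')
    x.stop_gradient = False
    v = paddle.ones(shape=[2, 2], dtype='float32') * 2

    y = func(x)
    # First-order gradient, kept on the graph so it can be differentiated again.
    (grad_x,) = paddle.grad(y, x, create_graph=True)
    # v^T H, obtained by differentiating the inner product <grad_x, v>.
    (vhp,) = paddle.grad(paddle.sum(grad_x * v), x)
    print(y)    # value 8.
    print(vhp)  # [[8., 8.], [8., 8.]]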
- -import typing - -from paddle.fluid import framework - - -def as_tensors(xs): - if isinstance(xs, framework.Variable): - return (xs, ) - elif isinstance(xs, typing.Sequence): - return tuple(xs) - else: - return xs diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index c37ac87da71b8..5ed01a0114421 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -2211,12 +2211,6 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): check_type(target_gradients, 'target_gradients', (framework.Variable, list, tuple, type(None)), 'paddle.static.gradients') - - from ..incubate.autograd.primx import _gradients - from ..incubate.autograd.utils import prim_enabled - if prim_enabled(): - return _gradients(targets, inputs, target_gradients) - outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) return _as_list(outs) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 832ecc61ee190..45c0a08efe828 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -17,7 +17,7 @@ endforeach() set_tests_properties(test_autograd_functional_dynamic PROPERTIES TIMEOUT 200) set_tests_properties(test_autograd_functional_static PROPERTIES TIMEOUT 160) -set_tests_properties(test_gradients_and_minimize PROPERTIES TIMEOUT 60) +set_tests_properties(test_minimize PROPERTIES TIMEOUT 60) if(NOT WIN32) set_tests_properties(test_autograd_functional_prim PROPERTIES TIMEOUT 60) endif() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index a98b509f963c7..6c67b78d6a539 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid import paddle.compat as cpt import paddle.nn.functional as F -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph, _in_eager_without_dygraph_check import config @@ -78,9 +78,9 @@ def vjp_test(): xs = self.gen_inputs(inputs) if v is not None: v = self.gen_inputs(v) - outputs, inputs_grad = paddle.autograd.vjp(func, xs, v) + outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs, v) else: - outputs, inputs_grad = paddle.autograd.vjp(func, xs) + outputs, inputs_grad = paddle.incubate.autograd.vjp(func, xs) return outputs, inputs_grad def grad_test(): @@ -116,14 +116,14 @@ def jvp_test(): xs = self.gen_inputs(inputs) if v is not None: v = self.gen_inputs(v) - outputs, outputs_grad = paddle.autograd.jvp( + outputs, outputs_grad = paddle.incubate.autograd.jvp( func, xs, v, create_graph=create_graph, allow_unused=allow_unused) else: - outputs, outputs_grad = paddle.autograd.jvp( + outputs, outputs_grad = paddle.incubate.autograd.jvp( func, xs, create_graph=create_graph, @@ -233,8 +233,8 @@ class TestVJPException(unittest.TestCase): def func_vjp(self): with self.assertRaises(self.expected_exception): - paddle.autograd.vjp(self.fun, paddle.to_tensor(self.xs), - paddle.to_tensor(self.v)) + paddle.incubate.autograd.vjp(self.fun, paddle.to_tensor(self.xs), + paddle.to_tensor(self.v)) def test_all_cases(self): with _test_eager_guard(): @@ -243,8 
+243,10 @@ def test_all_cases(self): def jac(grad_fn, f, inputs): - assert grad_fn in [paddle.autograd.vjp, paddle.autograd.jvp] - if grad_fn is paddle.autograd.jvp: + assert grad_fn in [ + paddle.incubate.autograd.vjp, paddle.incubate.autograd.jvp + ] + if grad_fn is paddle.incubate.autograd.jvp: vs = [paddle.zeros_like(x) for x in inputs] else: outputs = f(*inputs) @@ -265,7 +267,7 @@ def jac(grad_fn, f, inputs): JJ_cols.append(d_outs) # JJ is the fully unrolled jacobian JJ = paddle.stack(JJ_cols) - if grad_fn is paddle.autograd.vjp: + if grad_fn is paddle.incubate.autograd.vjp: JJ = JJ.t() return JJ @@ -279,8 +281,8 @@ def func_jvp_i1o1(self): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o1(self): @@ -289,8 +291,8 @@ def func_jvp_i2o1(self): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2(self): @@ -299,8 +301,8 @@ def func_jvp_i2o2(self): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - forward_jac = jac(paddle.autograd.jvp, f, inputs) - reverse_jac = jac(paddle.autograd.vjp, f, inputs) + forward_jac = jac(paddle.incubate.autograd.jvp, f, inputs) + reverse_jac = jac(paddle.incubate.autograd.vjp, f, inputs) self.check_results(forward_jac, reverse_jac) def func_jvp_i2o2_omitting_v(self): @@ -309,9 +311,9 @@ def func_jvp_i2o2_omitting_v(self): ] # noqa for f, inputs in test_cases: inputs = self.gen_inputs(inputs) - results_omitting_v = paddle.autograd.jvp(f, inputs) + results_omitting_v = paddle.incubate.autograd.jvp(f, inputs) v = [paddle.ones_like(x) for x in inputs] - results_with_v = paddle.autograd.jvp(f, inputs, v) + results_with_v = paddle.incubate.autograd.jvp(f, inputs, v) self.check_results(results_omitting_v, results_with_v) def test_all_cases(self): @@ -334,7 +336,7 @@ def test_all_cases(self): ('multi_in_single_out', paddle.matmul, (np.random.rand(2, 2), np.random.rand(2, 2))), )) -class TestJacobianClassNoBatch(unittest.TestCase): +class TestJacobianNoBatch(unittest.TestCase): def setUp(self): self._dtype = self.xs[0].dtype if isinstance( @@ -349,7 +351,7 @@ def setUp(self): def func_jacobian(self): xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, xs, False) + self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, False) self._expected = self._get_expected() Index = collections.namedtuple('Index', ('type', 'value')) @@ -387,7 +389,7 @@ def test_all_cases(self): ('3d_in_3d_out', utils.square, np.random.rand(2, 3, 4)), ('multi_in_single_out', utils.square, np.random.rand(2, 3)), )) -class TestJacobianClassBatchFirst(unittest.TestCase): +class TestJacobianBatchFirst(unittest.TestCase): def setUp(self): self._dtype = self.xs[0].dtype if isinstance( @@ -402,7 +404,7 @@ def setUp(self): def func_jacobian(self): xs = [paddle.to_tensor(x) for x in self.xs] if isinstance( self.xs, typing.Sequence) else 
paddle.to_tensor(self.xs) - self._actual = paddle.autograd.Jacobian(self.func, xs, True) + self._actual = paddle.incubate.autograd.Jacobian(self.func, xs, True) self._expected = self._get_expected() Index = collections.namedtuple('Index', ('type', 'value')) @@ -444,7 +446,7 @@ def test_all_cases(self): self.func_jacobian() -class TestHessianClassNoBatch(unittest.TestCase): +class TestHessianNoBatch(unittest.TestCase): @classmethod def setUpClass(self): @@ -470,7 +472,7 @@ def func(x): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False - hessian = paddle.autograd.Hessian(func, self.x) + hessian = paddle.incubate.autograd.Hessian(func, self.x) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -484,7 +486,7 @@ def func(x, y): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False self.y.stop_gradient = False - hessian = paddle.autograd.Hessian(func, [self.x, self.y]) + hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y]) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, rtol=self.rtol, @@ -500,7 +502,7 @@ def func(x, y): numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False self.y.stop_gradient = False - hessian = paddle.autograd.Hessian(func, [self.x, self.y]) + hessian = paddle.incubate.autograd.Hessian(func, [self.x, self.y]) np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -514,7 +516,7 @@ def func(x): func, self.x, self.numerical_delta, self.np_dtype) numerical_hessian = utils._np_concat_matrix_sequence(numerical_hessian) self.x.stop_gradient = False - hessian = paddle.autograd.Hessian(func, self.x) + hessian = paddle.incubate.autograd.Hessian(func, self.x) assert hessian[:].stop_gradient == False np.testing.assert_allclose(hessian[:].numpy(), numerical_hessian, self.rtol, self.atol) @@ -526,7 +528,7 @@ def func(x): return x * x with self.assertRaises(RuntimeError): - paddle.autograd.Hessian(func, paddle.ones([3])) + paddle.incubate.autograd.Hessian(func, paddle.ones([3])) def test_all_cases(self): with _test_eager_guard(): @@ -544,7 +546,7 @@ def test_all_cases(self): self.func_out_not_single() -class TestHessianClassBatchFirst(unittest.TestCase): +class TestHessianBatchFirst(unittest.TestCase): @classmethod def setUpClass(self): @@ -572,7 +574,7 @@ def func(x): expected = utils._compute_numerical_batch_hessian( func, self.x, self.numerical_delta, self.np_dtype) - H = paddle.autograd.Hessian(func, self.x, is_batched=True) + H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True) actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -596,7 +598,8 @@ def func(x, y): self.x.stop_gradient = False self.y.stop_gradient = False - H = paddle.autograd.Hessian(func, [self.x, self.y], is_batched=True) + H = paddle.incubate.autograd.Hessian(func, [self.x, self.y], + is_batched=True) actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -620,8 +623,8 @@ def func(x, y): utils.MatrixFormat.NBM, utils.MatrixFormat.BNM) - actual = paddle.autograd.Hessian(func, [self.x, self.y], - is_batched=True)[:] + actual = paddle.incubate.autograd.Hessian(func, [self.x, self.y], + is_batched=True)[:] np.testing.assert_allclose(actual, expected, @@ -638,7 +641,7 @@ def func(x): x = self.x.clone() x.stop_gradient = True - H = 
paddle.autograd.Hessian(func, self.x, is_batched=True)[:] + H = paddle.incubate.autograd.Hessian(func, self.x, is_batched=True)[:] actual = utils._np_transpose_matrix_format(H[:].numpy(), utils.MatrixFormat.BNM, utils.MatrixFormat.NBM) @@ -652,7 +655,9 @@ def func(x): return (x * x) with self.assertRaises(RuntimeError): - paddle.autograd.Hessian(func, paddle.ones((3, 3)), is_batched=True) + paddle.incubate.autograd.Hessian(func, + paddle.ones((3, 3)), + is_batched=True) def test_all_cases(self): with _test_eager_guard(): @@ -670,829 +675,5 @@ def test_all_cases(self): self.func_out_not_single() -class TestHessian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x) - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y]) - for i in range(len(hessian)): - for j in range(len(hessian[0])): - np.testing.assert_allclose(hessian[i][j].numpy(), - numerical_hessian[i][j], self.rtol, - self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, [self.x, self.y], - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.hessian(func, [self.x, self.y], - allow_unused=True) - for i in range(len(hessian)): - for j in range(len(hessian[0])): - if i == j == 0: - np.testing.assert_allclose(hessian[i][j].numpy(), - numerical_hessian[i][j], - self.rtol, self.atol) - else: - assert hessian[i][j] is None - - def func_create_graph_false(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x) - assert hessian.stop_gradient == True - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - try: - paddle.grad(hessian, self.x) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does 
not appear") > 0 - - def func_create_graph_true(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x): - return paddle.sum(F.sigmoid(x)) - - numerical_hessian = _compute_numerical_hessian(func, self.x, - self.numerical_delta, - self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.hessian(func, self.x, create_graph=True) - assert hessian.stop_gradient == False - np.testing.assert_allclose(hessian.numpy(), numerical_hessian[0][0], - self.rtol, self.atol) - triple_grad = paddle.grad(hessian, self.x) - assert triple_grad is not None - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestHessianFloat64(TestHessian): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - -class TestBatchHessian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.x_shape = (5, 2) - self.weight_shape = (2, 4) - self.y_shape = (5, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x, create_graph=True) - np.testing.assert_allclose(hessian, numerical_hessian, self.rtol, - self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.matmul(x * x * y * y, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y]) - - shape_tensor = paddle.to_tensor(numerical_hessian).astype("float64") - hessian_reshape = np.reshape(hessian, (shape_tensor.shape)) - np.testing.assert_allclose(hessian_reshape, numerical_hessian, - self.rtol, self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = 
paddle.autograd.batch_hessian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, [self.x, self.y], - allow_unused=True) - - for i in range(len(hessian)): - for j in range(len(hessian[0])): - if i == j == 0: - numerical_hessian = np.stack( - (numerical_hessian[i][j], numerical_hessian[i][j + 1]), - axis=0) - np.testing.assert_allclose(hessian[i][j], numerical_hessian, - self.rtol, self.atol) - else: - assert hessian[i][j] is None - - def func_create_graph_false(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x) - assert hessian.stop_gradient == True - np.testing.assert_allclose(hessian.numpy(), numerical_hessian, - self.rtol, self.atol) - try: - paddle.grad(hessian, self.x) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x): - return paddle.matmul(x * x, self.weight)[:, 0:1] - - numerical_hessian = _compute_numerical_batch_hessian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - hessian = paddle.autograd.batch_hessian(func, self.x, create_graph=True) - assert hessian.stop_gradient == False - np.testing.assert_allclose(hessian.numpy(), numerical_hessian, - self.rtol, self.atol) - triple_grad = paddle.grad(hessian, self.x) - assert triple_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_single_input() - self.func_multi_input() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestBatchHessianFloat64(TestBatchHessian): - - @classmethod - def setUpClass(self): - self.x_shape = (5, 2) - self.weight_shape = (2, 4) - self.y_shape = (5, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - -class TestVHP(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (2, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("eps") - self.rtol = config.TOLERANCE.get( - self.dtype).get("second_order_grad").get("rtol") - self.atol = config.TOLERANCE.get( - 
self.dtype).get("second_order_grad").get("atol") - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - self.vx = paddle.rand(shape=self.shape, dtype=self.dtype) - self.vy = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input(self): - - def func(x): - return paddle.sum(paddle.matmul(x, x)) - - numerical_func_output = func(self.x).numpy() - numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - - def func_multi_input(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_func_output = func(self.x, self.y).numpy() - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], - [self.vx, self.vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], - [self.vx, self.vy]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - for i in range(len(vhp)): - np.testing.assert_allclose(vhp[i].numpy(), numerical_vhp[i], - self.rtol, self.atol) - - def func_v_default(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, y)) - - numerical_func_output = func(self.x, self.y).numpy() - vx = paddle.ones(self.vx.shape, dtype=self.vx.dtype) - vy = paddle.ones(self.vy.shape, dtype=self.vy.dtype) - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], [vx, vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - for i in range(len(vhp)): - np.testing.assert_allclose(vhp[i].numpy(), numerical_vhp[i], - self.rtol, self.atol) - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.sum(paddle.matmul(x, x)) - - numerical_func_output = func(self.x, self.y).numpy() - numerical_vhp = _compute_numerical_vhp(func, [self.x, self.y], - [self.vx, self.vy], - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, [self.x, self.y], - [self.vx, self.vy]) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - - def func_create_graph_true(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x): - return paddle.sum(F.sigmoid(x)) - - numerical_func_output = func(self.x).numpy() - numerical_vhp = _compute_numerical_vhp(func, self.x, self.vx, - self.numerical_delta, - self.np_dtype) - - self.x.stop_gradient = False - func_output, vhp = paddle.autograd.vhp(func, self.x, self.vx) - np.testing.assert_allclose(func_output.numpy(), numerical_func_output, - self.rtol, self.atol) - assert vhp[0].stop_gradient == False - np.testing.assert_allclose(vhp[0].numpy(), numerical_vhp[0], self.rtol, - self.atol) - triple_grad = paddle.grad(vhp, self.x) - assert triple_grad is not None - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - 
def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_v_default() - self.func_multi_input() - self.func_single_input() - self.func_allow_unused_true() - self.func_create_graph_true() - self.setUpClass() - self.func_v_default() - self.func_multi_input() - self.func_single_input() - self.func_allow_unused_true() - self.func_create_graph_true() - - -class TestJacobian(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.shape = (4, 4) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = 1e-4 - self.rtol = 1e-3 - self.atol = 1e-3 - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - def func_single_input_and_single_output(self): - - def func(x): - return paddle.matmul(x, x) - - numerical_jacobian = _compute_numerical_jacobian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, self.x) - np.testing.assert_allclose(jacobian.numpy(), numerical_jacobian[0][0], - self.rtol, self.atol) - - def func_single_input_and_multi_output(self): - - def func(x): - return paddle.matmul(x, x), x * x - - numerical_jacobian = _compute_numerical_jacobian( - func, self.x, self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, self.x) - for i in range(len(jacobian)): - np.testing.assert_allclose(jacobian[i].numpy(), - numerical_jacobian[i][0], self.rtol, - self.atol) - - def func_multi_input_and_single_output(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - - def func_multi_input_and_multi_output(self): - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - - def func(x, y): - return paddle.matmul(x, y), x * y - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for i in range(len(jacobian)): - for j in range(len(jacobian[0])): - np.testing.assert_allclose(jacobian[i][j].numpy(), - numerical_jacobian[i][j], self.rtol, - self.atol) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) - - def func_allow_unused_false(self): - - def func(x, y): - return paddle.matmul(x, x) - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return paddle.matmul(x, x) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y], - allow_unused=True) - np.testing.assert_allclose(jacobian[0].numpy(), - numerical_jacobian[0][0], self.rtol, - self.atol) - assert jacobian[1] is None - - def func_create_graph_false(self): - - def func(x, y): - return paddle.matmul(x, 
y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == True - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - try: - paddle.grad(jacobian[0], [self.x, self.y]) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x, y): - return paddle.matmul(x, y) - - numerical_jacobian = _compute_numerical_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.jacobian(func, [self.x, self.y], - create_graph=True) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == False - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - double_grad = paddle.grad(jacobian[0], [self.x, self.y]) - assert double_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_multi_input_and_multi_output() - self.func_multi_input_and_single_output() - self.func_single_input_and_multi_output() - self.func_single_input_and_single_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_multi_input_and_multi_output() - self.func_multi_input_and_single_output() - self.func_single_input_and_multi_output() - self.func_single_input_and_single_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestJacobianFloat64(TestJacobian): - - @classmethod - def setUpClass(self): - self.shape = (4, 4) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = 1e-7 - self.rtol = 1e-7 - self.atol = 1e-7 - self.x = paddle.rand(shape=self.shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.shape, dtype=self.dtype) - - -class TestJacobianBatch(unittest.TestCase): - - @classmethod - def setUpClass(self): - self.x_shape = (4, 2) - self.weight_shape = (2, 4) - self.y_shape = (4, 2) - self.dtype = 'float32' - self.np_dtype = np.float32 - self.numerical_delta = 1e-4 - self.rtol = 1e-3 - self.atol = 1e-3 - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - def func_batch_single_input_and_batch_single_output(self): - - def func(x): - return paddle.matmul(paddle.matmul(x, self.weight), self.y) - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian( - func, - self.x, - ) - - self.assertTrue( - np.allclose(batch_jacobian.numpy().all(), - numerical_jacobian[0][0].all())) - - def func_batch_single_input_and_batch_multi_output(self): - - def func(x): - return paddle.matmul(paddle.matmul(x, self.weight), self.y), x * x - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - 
batch_jacobian = paddle.autograd.batch_jacobian( - func, - self.x, - ) - - for i in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[i].numpy(), - numerical_jacobian[i][0], self.rtol, - self.atol) - - def func_batch_multi_input_and_batch_single_output(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - - for j in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - - def func_batch_multi_input_and_batch_multi_output(self): - - def func(x, y): - return x * y, x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - - self.x.stop_gradient = False - self.y.stop_gradient = False - batch_jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - - for i in range(len(batch_jacobian)): - np.testing.assert_allclose(batch_jacobian[i], numerical_jacobian[i], - self.rtol, self.atol) - - def func_allow_unused_false(self): - - def func(x, y): - return x * x - - try: - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - except ValueError as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("allow_unused") > 0 - - def func_allow_unused_true(self): - - def func(x, y): - return x * x - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y], - allow_unused=True) - - np.testing.assert_allclose(jacobian[0].numpy(), - numerical_jacobian[0][0], self.rtol, - self.atol) - assert jacobian[1] is None - - def func_create_graph_false(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y]) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == True - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - try: - paddle.grad(jacobian[0], [self.x, self.y]) - except Exception as e: - error_msg = cpt.get_exception_message(e) - assert error_msg.find("has no gradient") > 0 or error_msg.find( - "does not appear") > 0 - - def func_create_graph_true(self): - - def func(x, y): - return x * y - - numerical_jacobian = _compute_numerical_batch_jacobian( - func, [self.x, self.y], self.numerical_delta, self.np_dtype) - self.x.stop_gradient = False - self.y.stop_gradient = False - jacobian = paddle.autograd.batch_jacobian(func, [self.x, self.y], - create_graph=True) - for j in range(len(jacobian)): - assert jacobian[j].stop_gradient == False - np.testing.assert_allclose(jacobian[j].numpy(), - numerical_jacobian[0][j], self.rtol, - self.atol) - double_grad = paddle.grad(jacobian[0], [self.x, self.y]) - assert double_grad is not None - - def test_all_cases(self): - with _test_eager_guard(): - self.setUpClass() - self.func_batch_single_input_and_batch_single_output() - 
self.func_batch_single_input_and_batch_multi_output() - self.func_batch_multi_input_and_batch_single_output() - self.func_batch_multi_input_and_batch_multi_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - self.setUpClass() - self.func_batch_single_input_and_batch_single_output() - self.func_batch_single_input_and_batch_multi_output() - self.func_batch_multi_input_and_batch_single_output() - self.func_batch_multi_input_and_batch_multi_output() - self.func_allow_unused_false() - self.func_allow_unused_true() - self.func_create_graph_false() - self.func_create_graph_true() - - -class TestJacobianBatchFloat64(TestJacobianBatch): - - @classmethod - def setUpClass(self): - self.x_shape = (12, 2) - self.weight_shape = (2, 12) - self.y_shape = (12, 2) - self.dtype = 'float64' - self.np_dtype = np.float64 - self.numerical_delta = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('eps') - self.rtol = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('rtol') - self.atol = config.TOLERANCE.get( - self.dtype).get('second_order_grad').get('atol') - self.x = paddle.rand(shape=self.x_shape, dtype=self.dtype) - self.weight = paddle.rand(shape=self.weight_shape, dtype=self.dtype) - self.y = paddle.rand(shape=self.y_shape, dtype=self.dtype) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py index f75460df6b52d..d17420c904546 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_prim.py @@ -145,5 +145,130 @@ def wrapper(fun, args): atol=self._atol) +@utils.place(config.DEVICES) +@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), ( + ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'), + ('binary_float32', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'), + ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'), + ('binary_float64', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'), +)) +class TestJvpPrim(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.args = [arg.astype(cls.dtype) for arg in cls.args] + cls._rtol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('rtol') + cls._atol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('atol') + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_jacobian_prim(self): + + def wrapper(fun, args): + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + static_args = [ + paddle.static.data(f'arg{i}', arg.shape, self.dtype) + for i, arg in enumerate(args) + ] + for arg in static_args: + arg.stop_gradient = False + _, jvp_res = paddle.incubate.autograd.jvp(fun, static_args) + if paddle.incubate.autograd.prim_enabled(): + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(sp) + jvp_res = exe.run( + mp, + feed={f'arg{i}': arg + for i, arg in enumerate(args)}, + fetch_list=[jvp_res]) + return jvp_res + + paddle.incubate.autograd.enable_prim() + prim_jvp = wrapper(self.fun, self.args) + 
paddle.incubate.autograd.disable_prim() + orig_jvp = wrapper(self.fun, self.args) + + np.testing.assert_allclose(orig_jvp, + prim_jvp, + rtol=self._rtol, + atol=self._atol) + + +@utils.place(config.DEVICES) +@utils.parameterize((utils.TEST_CASE_NAME, 'fun', 'args', 'dtype'), ( + ('unary_float32', paddle.tanh, (np.random.rand(2, 3), ), 'float32'), + ('binary_float32', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float32'), + ('unary_float64', paddle.tanh, (np.random.rand(2, 3), ), 'float64'), + ('binary_float64', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), 'float64'), +)) +class TestVjpPrim(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.args = [arg.astype(cls.dtype) for arg in cls.args] + cls._rtol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('rtol') + cls._atol = config.TOLERANCE.get( + cls.dtype).get('first_order_grad').get('atol') + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_jacobian_prim(self): + + def wrapper(fun, args): + mp = paddle.static.Program() + sp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + static_args = [ + paddle.static.data(f'arg{i}', arg.shape, self.dtype) + for i, arg in enumerate(args) + ] + for arg in static_args: + arg.stop_gradient = False + _, vjp_res = paddle.incubate.autograd.vjp(fun, static_args) + if paddle.incubate.autograd.prim_enabled(): + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(sp) + vjp_res = exe.run( + mp, + feed={f'arg{i}': arg + for i, arg in enumerate(args)}, + fetch_list=[vjp_res]) + return vjp_res + + paddle.incubate.autograd.enable_prim() + prim_vjp = wrapper(self.fun, self.args) + paddle.incubate.autograd.disable_prim() + orig_vjp = wrapper(self.fun, self.args) + + for orig, prim in zip(orig_vjp, prim_vjp): + np.testing.assert_allclose(orig, + prim, + rtol=self._rtol, + atol=self._atol) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py index 4e01ad5382c91..9b2098d37b882 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_static.py @@ -59,7 +59,8 @@ def _vjp(self): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v, stop_gradient=self.stop_gradient) - ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v) + ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs, + static_v) exe.run(sp) return exe.run(mp, feed=feed, fetch_list=[ys, xs_grads]) @@ -103,7 +104,8 @@ def _vjp(self): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v) - ys, xs_grads = paddle.autograd.vjp(self.fun, static_xs, static_v) + ys, xs_grads = paddle.incubate.autograd.vjp(self.fun, static_xs, + static_v) self.exe.run(sp) return self.exe.run(mp, feed, fetch_list=[ys, xs_grads]) @@ -214,7 +216,7 @@ def run_test_by_fullmatrix(self, pd_f, np_f, inps, batch=False): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = 
paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: _, nrow, ncol = JJ.shape else: @@ -244,7 +246,7 @@ def run_test_by_rows(self, pd_f, np_f, inps, batch=False): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: nbatch, nrow, ncol = JJ.shape rows = [JJ[:, i, :] for i in range(nrow)] @@ -269,7 +271,7 @@ def run_test_by_entries(self, pd_f, np_f, inps, batch=False): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - JJ = paddle.autograd.functional.Jacobian(pd_f, xs, is_batched=batch) + JJ = paddle.incubate.autograd.Jacobian(pd_f, xs, is_batched=batch) if batch: nbatch, nrow, ncol = JJ.shape entries = [ @@ -390,7 +392,7 @@ def run_test_by_fullmatrix(self, pd_f, inps, np_hess, batch=False): startup = fluid.Program() with fluid.program_guard(main, startup): xs = make_tensors(inps) - HH = paddle.autograd.functional.Hessian(pd_f, xs, is_batched=batch) + HH = paddle.incubate.autograd.Hessian(pd_f, xs, is_batched=batch) nrow, ncol = HH.shape full_hessian = HH[:] exe = fluid.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py b/python/paddle/fluid/tests/unittests/autograd/test_minimize.py similarity index 56% rename from python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py rename to python/paddle/fluid/tests/unittests/autograd/test_minimize.py index 67ebe01d9f027..10259802c6933 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_gradients_and_minimize.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_minimize.py @@ -13,82 +13,16 @@ # limitations under the License. 
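For orientation, the static-graph flow exercised by the new ``TestJvpPrim`` and ``TestVjpPrim`` wrappers above is: enable the prim transforms, build the derivative program via ``jvp`` or ``vjp``, lower the prim ops back to original ops, then execute. A minimal sketch assuming the same ``paddle.incubate.autograd`` entry points used in those wrappers:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.enable_static()
    paddle.incubate.autograd.enable_prim()

    mp, sp = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(mp, sp):
        x = paddle.static.data('x', shape=[2, 3], dtype='float32')
        x.stop_gradient = False
        # Forward-mode derivative of tanh; vjp is used the same way.
        _, x_dot = paddle.incubate.autograd.jvp(paddle.tanh, x)
        if paddle.incubate.autograd.prim_enabled():
            # Lower autodiff primitives back to original ops so the program
            # can run on a stock executor.
            paddle.incubate.autograd.prim2orig()

    exe = paddle.static.Executor()
    exe.run(sp)
    out, = exe.run(mp,
                   feed={'x': np.random.rand(2, 3).astype('float32')},
                   fetch_list=[x_dot])

    paddle.incubate.autograd.disable_prim()
    paddle.disable_static()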
import unittest -import numpy as np +import numpy as np import paddle from paddle.incubate.autograd.primx import prim2orig -from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled +from paddle.incubate.autograd.utils import (disable_prim, enable_prim, + prim_enabled) paddle.enable_static() -class TestGradients(unittest.TestCase): - - def test_third_order(self): - enable_prim() - main = paddle.static.Program() - startup = paddle.static.Program() - with paddle.static.program_guard(main, startup): - x = paddle.static.data(name='x', shape=[1], dtype='float32') - x2 = paddle.multiply(x, x) - x3 = paddle.multiply(x2, x) - x4 = paddle.multiply(x3, x) - - grad1, = paddle.static.gradients([x4], [x]) - grad2, = paddle.static.gradients([grad1], [x]) - grad3, = paddle.static.gradients([grad2], [x]) - - prim2orig(main.block(0)) - - feed = {x.name: np.array([2.]).astype('float32')} - fetch_list = [grad3.name] - result = [np.array([48.])] - - place = paddle.CPUPlace() - if paddle.device.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(startup) - outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) - disable_prim() - - def test_fourth_order(self): - enable_prim() - main = paddle.static.Program() - startup = paddle.static.Program() - with paddle.static.program_guard(main, startup): - x = paddle.static.data(name='x', shape=[1], dtype='float32') - x2 = paddle.multiply(x, x) - x3 = paddle.multiply(x2, x) - x4 = paddle.multiply(x3, x) - x5 = paddle.multiply(x4, x) - out = paddle.sqrt(x5 + x4) - - grad1, = paddle.static.gradients([out], [x]) - grad2, = paddle.static.gradients([grad1], [x]) - grad3, = paddle.static.gradients([grad2], [x]) - grad4, = paddle.static.gradients([grad3], [x]) - - prim2orig(main.block(0)) - - feed = { - x.name: np.array([2.]).astype('float32'), - } - fetch_list = [grad4.name] - # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5) - result = [np.array([-0.27263762711])] - - place = paddle.CPUPlace() - if paddle.device.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(startup) - outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) - disable_prim() - - class TestMinimize(unittest.TestCase): def model(self, x, w, bias, opt): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py index 0137f4103fbb3..dc52c5bc33b48 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py @@ -37,7 +37,7 @@ ('input_gradients_not_none', paddle.matmul, (np.random.rand(3, 3), np.random.rand(3, 3)), (np.random.rand(3, 3), np.random.rand(3, 3)), 'float64'))) -class TestForwardGradients(unittest.TestCase): +class TestForwardGrad(unittest.TestCase): @classmethod def setUpClass(cls): @@ -55,7 +55,7 @@ def tearDown(self): paddle.incubate.autograd.disable_prim() paddle.disable_static() - def test_forward_gradients(self): + def test_forward_grad(self): def expected(): paddle.incubate.autograd.disable_prim() @@ -64,7 +64,8 @@ def expected(): with paddle.static.program_guard(mp, sp): feed, static_xs, static_v = utils.gen_static_data_and_feed( self.xs, self.v, stop_gradient=False) - _, ys_grad = paddle.autograd.jvp(self.fun, static_xs, static_v) + _, ys_grad = paddle.incubate.autograd.jvp( + self.fun, static_xs, static_v) exe = paddle.static.Executor() exe.run(sp) out = exe.run(mp, 
feed=feed, fetch_list=ys_grad) @@ -80,7 +81,8 @@ def actual(): self.xs, self.v, stop_gradient=False) ys = self.fun(*static_xs) if isinstance( static_xs, typing.Sequence) else self.fun(static_xs) - ys_grad = primapi.forward_gradients(ys, static_xs, static_v) + ys_grad = paddle.incubate.autograd.forward_grad( + ys, static_xs, static_v) paddle.incubate.autograd.prim2orig(mp.block(0)) exe = paddle.static.Executor() exe.run(sp) @@ -106,7 +108,7 @@ def test_prim_disabled(self): self.xs, self.v, stop_gradient=False) ys = self.fun(*static_xs) if isinstance( static_xs, typing.Sequence) else self.fun(static_xs) - ys_grad = primapi.forward_gradients(ys, static_xs, static_v) + ys_grad = primapi.forward_grad(ys, static_xs, static_v) paddle.incubate.autograd.prim2orig(mp.block(0)) exe = paddle.static.Executor() exe.run(sp) @@ -116,14 +118,125 @@ def test_prim_disabled(self): def test_illegal_param(self): paddle.incubate.autograd.enable_prim() with self.assertRaises(TypeError): - primapi.forward_gradients(1, paddle.static.data('inputs', - shape=[1])) + primapi.forward_grad(1, paddle.static.data('inputs', shape=[1])) with self.assertRaises(TypeError): - primapi.forward_gradients(paddle.static.data('targets', shape=[1]), - 1) + primapi.forward_grad(paddle.static.data('targets', shape=[1]), 1) paddle.incubate.autograd.disable_prim() +class TestGrad(unittest.TestCase): + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_third_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + + grad1, = paddle.incubate.autograd.grad([x4], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + + paddle.incubate.autograd.prim2orig(main.block(0)) + + feed = {x.name: np.array([2.]).astype('float32')} + fetch_list = [grad3.name] + result = [np.array([48.])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.allclose(outs, result) + paddle.incubate.autograd.disable_prim() + + def test_fourth_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + x5 = paddle.multiply(x4, x) + out = paddle.sqrt(x5 + x4) + + grad1, = paddle.incubate.autograd.grad([out], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + grad4, = paddle.incubate.autograd.grad([grad3], [x]) + + paddle.incubate.autograd.prim2orig(main.block(0)) + + feed = { + x.name: np.array([2.]).astype('float32'), + } + fetch_list = [grad4.name] + # (3*(-5*x^2-16*x-16))/(16*(x+1)^3.5) + result = [np.array([-0.27263762711])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, 
feed=feed, fetch_list=fetch_list) + np.allclose(outs, result) + paddle.incubate.autograd.disable_prim() + + def test_disable_prim(self): + + def actual(x: np.array): + paddle.incubate.autograd.disable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + var_x = paddle.static.data('x', shape=x.shape, dtype=x.dtype) + var_x.stop_gradient = False + y = paddle.tanh(var_x) + y_grad = paddle.incubate.autograd.grad(y, var_x) + y_second_grad = paddle.incubate.autograd.grad(y_grad, var_x) + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, + feed={'x': x}, + fetch_list=[y_grad, y_second_grad]) + + def expect(x: np.array): + paddle.incubate.autograd.disable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + var_x = paddle.static.data('x', shape=x.shape, dtype=x.dtype) + var_x.stop_gradient = False + y = paddle.tanh(var_x) + y_grad = paddle.static.gradients(y, var_x) + y_second_grad = paddle.static.gradients(y_grad, var_x) + exe = paddle.static.Executor() + exe.run(startup) + return exe.run(main, + feed={'x': x}, + fetch_list=[y_grad, y_second_grad]) + + x = np.random.randn(100, 200) + for i, j in zip(actual(x), expect(x)): + np.testing.assert_allclose(i, j) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primops.py b/python/paddle/fluid/tests/unittests/autograd/test_primops.py index ccbd630bfd084..f14664237f36f 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primops.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primops.py @@ -21,7 +21,7 @@ concat, reduce, matmul, slice_select, slice_assign, gather, scatter_add, fill_const) -from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig, _gradients +from paddle.incubate.autograd.primx import Transform, topo_path, orig2prim, prim2orig from paddle.incubate.autograd.utils import enable_prim, disable_prim, prim_enabled diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py index 8a0e51f60f47b..6afd0ff392288 100644 --- a/python/paddle/fluid/tests/unittests/autograd/utils.py +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -22,7 +22,7 @@ import collections import numpy as np import paddle -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors ########################################################## diff --git a/python/paddle/incubate/autograd/__init__.py b/python/paddle/incubate/autograd/__init__.py index 718bc018d9fe5..c5ff3b18d4d49 100644 --- a/python/paddle/incubate/autograd/__init__.py +++ b/python/paddle/incubate/autograd/__init__.py @@ -11,11 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from paddle.autograd.functional import Hessian, Jacobian, jvp, vjp +from .functional import Hessian, Jacobian, jvp, vjp +from .primapi import forward_grad, grad from .primx import prim2orig -from .utils import enable_prim, disable_prim, prim_enabled +from .utils import disable_prim, enable_prim, prim_enabled __all__ = [ # noqa - 'vjp', 'jvp', 'Jacobian', 'Hessian', 'prim2orig', 'enable_prim', - 'disable_prim', 'prim_enabled' + 'vjp', 'jvp', 'Jacobian', 'Hessian', 'enable_prim', 'disable_prim', + 'forward_grad', 'grad' ] diff --git a/python/paddle/incubate/autograd/functional.py b/python/paddle/incubate/autograd/functional.py new file mode 100644 index 0000000000000..6c740005f8253 --- /dev/null +++ b/python/paddle/incubate/autograd/functional.py @@ -0,0 +1,675 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import typing + +import paddle +from paddle.fluid import framework +from paddle.incubate.autograd import primapi, utils + + +def vjp(func, xs, v=None): + r"""Computes the Vector-Jacobian product, a functional form of + reverse mode automatic differentiation. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func(Callable): A function that takes ``xs`` as inputs parameter and + returns a sequence of Tensors or a Tensor. + xs(Tensor|Sequence[Tensor]): Used as positional arguments to evaluate + ``func``. ``xs`` is accepted as one Tensor or a sequence of Tensors. + v(Tensor|Sequence[Tensor]|None, optional): The cotangent vector invovled + in the VJP computation. ``v`` matches the size and shape of + ``func`` 's output. Defaults to None, which is equivalent to all + ones the same size of ``func`` 's output. + + Returns: + output(tuple): + + - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . + - vjp(Tensor|tuple[Tensor]): The vjp result. + + Examples: + + .. code-block:: python + + import paddle + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + _, vjp_result = paddle.incubate.autograd.vjp(func, x) + print(vjp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[4., 4.], + # [4., 4.]]) + + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + _, vjp_result = paddle.incubate.autograd.vjp(func, x, v) + print(vjp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]]) + """ + _check_inputs(func, xs, v) + + # ``_seprate`` breaks the dependencies between ``xs`` and other + # variables. See more ``_seprate`` . + if paddle.fluid._non_static_mode() or not utils.prim_enabled(): + xs, v = _separate(xs), _separate(v) + ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) + _check_v_shape(v, ys) + + return ys, _grad(ys, xs, v) + + +def jvp(func, xs, v=None): + r""" + Computes the Jacobian-Vector product for a function at the given + inputs and a vector in the tangent space induced by the inputs. 
+ + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func(Callable): The ``func`` takes as input a Tensor or a Sequence + of Tensors and returns a Tensor or a Sequence of Tensors. + xs(Tensor|Sequence[Tensor]): Used as positional arguments to + evaluate ``func``. The ``xs`` is accepted as one Tensor or a + Sequence of Tensors. + v(Tensor|Sequence[Tensor]|None, Optional): The tangent vector invovled + in the JVP computation. The ``v`` matches the size and shape of + ``xs`` . Default value is None and in this case is equivalent to + all ones the same size of ``xs`` . + + Returns: + output(tuple): + + - func_out(Tensor|tuple[Tensor]): The output of ``func(xs)`` . + - jvp(Tensor|tuple[Tensor]): The jvp result. + + Examples: + + .. code-block:: python + + import paddle + + + def func(x): + return paddle.matmul(x, x) + + + x = paddle.ones(shape=[2, 2], dtype='float32') + _, jvp_result = paddle.incubate.autograd.jvp(func, x) + print(jvp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[4., 4.], + # [4., 4.]]) + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + _, jvp_result = paddle.incubate.autograd.jvp(func, x, v) + print(jvp_result) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]]) + + """ + _check_inputs(func, xs, v) + # ``_seprate`` breaks the dependencies between ``xs`` and other + # variables. See more ``_seprate`` . + if paddle.fluid._non_static_mode() or not utils.prim_enabled(): + xs, v = _separate(xs), _separate(v) + ys = func(*xs) if isinstance(xs, typing.Sequence) else func(xs) + _check_v_shape(v, xs) + + if not paddle.fluid._non_static_mode() and utils.prim_enabled(): + return ys, primapi.forward_grad(ys, xs, v) + else: + return ys, _double_backward_trick(ys, xs, v) + + +def _double_backward_trick(ys, xs, v): + """Double backward trick for computing ``jvp`` by ``vjp`` + see details: https://j-towns.github.io/2017/06/12/A-new-trick.html + """ + # The value of ys_grad is not important, it can be any random value in + # theory, but it's required to set stop_gradient=False. + ys_grad = _zeros_like_with_grad(ys) + xs_grad = _grad(ys, xs, ys_grad) + return _grad(xs_grad, ys_grad, v) + + +def _zeros_like_with_grad(xs): + """Create a zero or zeros sequence Tensor like ``xs`` with a flag + ``stop_graident=False`` . + """ + if not isinstance(xs, typing.Sequence): + ys = paddle.zeros_like(xs) + ys.stop_gradient = False + else: + ys = [] + for x in xs: + y = paddle.zeros_like(x) + y.stop_gradient = False + ys.append(y) + return ys + + +class Jacobian(object): + r""" + Computes the Jacobian matrix of a given function. + + If the function has multiple inputs and multiple outputs, during internal + implementation, all input tensors are concatenated after being flatten, + the batch dimension is retained, and the output is subject to the same + processing rules. + + Once the Jacobian ``J`` is constructed, you can use a multidimensional index + to retrieve the submatrix of ``J``, as same as slicing a Tensor. The + submatrix is lazily evaluated along row axis, and will be cached once + evaluated. + + For examples, supposing ``is_batched=True``, you can retrieve the submatrix + by following methods: + + * J[:], retrieving the full matrix. + * J[:, :, j], retrieving the partial derivatives w.r.t. the j'th input + variable. + * J[:, i, :], retrieving the partial derivatives w.r.t. the i'th output + variable. 
+ * J[:, i, j], retrieving the partial derivatives w.r.t. the i'th output + variable and the j'th input variable. + + Notes: + + Eclipsis index is not supported currently. + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + + func (Callable): A python function that takes a Tensor or a sequence of + Tensors as inputs(the first dimension is batch size) and + returns a Tensor a sequence of Tensors. + xs (Tensor|Sequence[Tensor]): The input to the function ``func`` . + is_batched (bool): If true, the first axis is batch axis. Defaults to + False. + + Returns: + + Jacobian (Object): A python object retains the Jacobian matrix. + + Examples: + + .. code-block:: python + + import paddle + + + def func(x, y): + return paddle.matmul(x, y) + + + x = paddle.to_tensor([[1., 2.], [3., 4.]]) + J = paddle.incubate.autograd.Jacobian(func, [x, x]) + print(J[:, :]) + # Tensor(shape=[4, 8], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[1., 3., 0., 0., 1., 0., 2., 0.], + # [2., 4., 0., 0., 0., 1., 0., 2.], + # [0., 0., 1., 3., 3., 0., 4., 0.], + # [0., 0., 2., 4., 0., 3., 0., 4.]]) + + print(J[0, :]) + # Tensor(shape=[8], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [1., 3., 0., 0., 1., 0., 2., 0.]) + print(J[:, 0]) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [1., 2., 0., 0.]) + + """ + + def __init__(self, func, xs, is_batched=False): + if not is_batched: + self._jacobian = _JacobianNoBatch(func, xs) + else: + self._jacobian = _JacobianBatchFirst(func, xs) + + def __getitem__(self, indexes): + return self._jacobian[indexes] + + @property + def shape(self): + """The shape of flattened Jacobian matrix. + """ + return self._jacobian.shape + + +class Hessian(object): + """ + Computes the Hessian matrix with a given ``func`` with respect to ``xs`` . + + If the function has multiple inputs, during internal implementation, + all input tensors are concatenated after being flatten, the batch dimension + is retained. + + The Hessian submatrix is lazily evaluated, and can be retrieved with a + multidimensional indexes. See details ``Jacobian`` . + + Warning: + This API is in beta, the signatures could be changed in future version. + + Args: + func (Callable): A python function that takes a Tensor or a Tensor + sequence as inputs and returns a Tensor with shape + ``[batch_size, 1]`` with batch or ``[1]`` without batch. + xs (Tensor|Sequence(Tensor)): The input Tensor or Tensor sequence of + the function ``func``. + is_batched (bool): If true, the first axis is batch axis. Defaults to + False. + + Returns: + + Hessian (Object): A python object retains the Hessian matrix. + + + Examples: + + .. code-block:: python + + import paddle + + + def reducer(x): + return paddle.sum(x * x) + + + x = paddle.rand([2, 2]) + h = paddle.incubate.autograd.Hessian(reducer, x) + print(h[:]) + # Tensor(shape=[4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[2., 0., 0., 0.], + # [0., 2., 0., 0.], + # [0., 0., 2., 0.], + # [0., 0., 0., 2.]]) + """ + + def __init__(self, func, xs, is_batched=False): + + def _jac_func(*xs): + jac = Jacobian(func, xs, is_batched=is_batched) + if (is_batched and jac.shape[1] != 1) or (not is_batched + and jac.shape[0] != 1): + raise RuntimeError( + "The function given to Hessian shoud return as single element Tensor or batched single element Tensor." 
+ ) + return jac[:, 0, :] if is_batched else jac[0, :] + + self.symbolic = Jacobian(_jac_func, xs, is_batched=is_batched) + + def __getitem__(self, indexes): + return self.symbolic[indexes] + + @property + def shape(self): + """The shape of flattened Hessian matrix. + """ + return self.symbolic.shape + + +class _Jacobian(object): + """The base class for computing Jacobian matrix. + + ``_Jacobian`` implementes the core logic of multidimensional index and lazy + evaluation for Jacobian matrix, subclass only need to overwrite following + methods: + + * ``_lazy_axis()``, return the axis along which will be lazy + evaluating. + * ``_flatten(xs)``, flattens the inputs ``xs``. + * ``_evaluate(index)``, evaluates one slice along ``_lazy_axis`` . + + Notes: + + Because currently PaddlePaddle only support reverse differentiation by + ``paddle.grad``, so lazy evaluation is only supported along the row of + Jacobian matrix, which means that slicing along row will get better + performance. + + """ + + def __init__(self, func, xs): + # Skip separating in prim mode temporarily, as detach and clone are not + # primitive operators. + if not paddle.fluid._non_static_mode() and utils.prim_enabled(): + self._xs = xs + else: + self._xs = _separate(xs) + self._ys = func(*utils.as_tensors(self._xs)) + self._flatten_xs = self._flatten(utils.as_tensors(self._xs)) + self._flatten_ys = self._flatten(utils.as_tensors(self._ys)) + self._cache = {} + + @property + def shape(self): + raise NotImplementedError + + @property + def _lazy_axis(self): + """"The axis of lazily evaluated.""" + raise NotImplementedError + + def _lazy_indexes(self, indexes): + idx = indexes[self._lazy_axis] + return (idx, ) if isinstance(idx, int) else tuple( + range(idx.start, idx.stop, idx.step)) + + def _flatten(self, xs): + raise NotImplementedError + + def _shifted_indexes(self, indexes, lazy_axis_size=0): + idx = indexes[self._lazy_axis] + shifted_lazy_axis_idx = 0 if isinstance(idx, int) else slice( + 0, lazy_axis_size, 1) + return indexes[:self._lazy_axis] + ( + shifted_lazy_axis_idx, ) + indexes[self._lazy_axis + 1:] + + def __getitem__(self, indexes): + indexes = _multi_index(indexes, self.shape) + + if isinstance(indexes[self._lazy_axis], int): + other_indexes = indexes[:self._lazy_axis] + \ + indexes[self._lazy_axis+1:] + return self._cached_evaluate( + indexes[self._lazy_axis])[other_indexes] + lazy_indexes = self._lazy_indexes(indexes) + # Using concat and reshape to replace stack operator temporarily, as + # it is not a primitive operator. + shape = list(self.shape) + shape[self._lazy_axis] = len(lazy_indexes) + part_jac = paddle.concat( + [self._cached_evaluate(i) for i in lazy_indexes], + axis=self._lazy_axis).reshape(shape) + return part_jac[self._shifted_indexes(indexes, len(lazy_indexes))] + + def _cached_evaluate(self, k): + v = self._cache.get(k) + if v is None: + v = self._evaluate(k) + self._cache[k] = v + return v + + def _evaluate(self, index): + """Evaluate one slice at along lazy axis.""" + raise NotImplementedError + + +class _JacobianNoBatch(_Jacobian): + """Compute Jacobian matrix without batch dimension. + Suppose the mapping is :math:`f: R^M \to R^N`, the output shape is + ``(N, M)`` . 
+ """ + + def __init__(self, func, xs): + super(_JacobianNoBatch, self).__init__(func, xs) + + @property + def shape(self): + return (self._flatten_ys.shape[0], self._flatten_xs.shape[0]) + + @property + def _lazy_axis(self): + return 0 + + def _flatten(self, xs): + return paddle.concat(tuple(x.reshape((-1, )) for x in xs)) + + def _evaluate(self, row_index): + return self._flatten(_grad( + self._flatten_ys[row_index], + self._xs, + )) + + +class _JacobianBatchFirst(_Jacobian): + """Compute Jacobian matrix with batch at first axis. + Suppose the mapping is :math:`f: R^{B,M} \to R^{B,N}`, the output shape is + ``(B, N, M)`` . + """ + + def __init__(self, func, xs): + super(_JacobianBatchFirst, self).__init__(func, xs) + + @property + def shape(self): + return (self._flatten_xs.shape[0], self._flatten_ys.shape[1], + self._flatten_xs.shape[1]) + + @property + def _lazy_axis(self): + return 1 + + def _flatten(self, xs): + return paddle.concat( + tuple(x.reshape((x.shape[0], -1)) for x in utils.as_tensors(xs)), 1) + + def _evaluate(self, row_index): + return self._flatten(_grad(self._flatten_ys[:, row_index], self._xs)) + + +def _multi_index(indexes, shape): + """A tool for parsing N-dimensional index into a standard format. + + Currently supporting following input format: + * ([positive|negative|slice], ...), the right-most elements can be + omited. + + The standard format after converted is slice tuple which contains N elements: + * ([positive|slice], ..., [positive|slice]) + + Notes: + Ellipsis indexes such as ``(..., i), (i, ...)`` is not supported. + + Args: + indexes (tuple): The input indexes. + shape (tuple): The input shape. + + Returns: + tuple: The standard format index as the above description. + """ + indexes = indexes if isinstance(indexes, typing.Sequence) else (indexes, ) + if any(isinstance(i, type(Ellipsis)) for i in indexes): + raise IndexError('Ellipsis index currently is not supported.') + # Fill the right-most elements. + indexes = indexes + (slice(0, None, None), ) * (len(shape) - len(indexes)) + # Convert to positive index. + positive_indexes = [] + for i, index in enumerate(indexes): + if isinstance(index, slice): + index = slice(index.start or 0, index.stop or shape[i], index.step + or 1) + positive_indexes.append( + slice( + index.start + shape[i] if index.start < 0 else index.start, + index.stop + shape[i] if index.stop < 0 else index.stop, + # Negative step means index backward, no need to convert to + # positive interger. + index.step)) + elif isinstance(index, int): + positive_indexes.append(index + shape[i] if index < 0 else index) + else: + raise TypeError(f'Not supported index type {index}.') + return tuple(positive_indexes) + + +def _replace_none_with_zero_tensor(xs, refs): + if xs is None: + xs = paddle.zeros_like(refs) + xs.stop_gradient = refs.stop_gradient + return xs + elif isinstance(xs, typing.Sequence): + return tuple( + _replace_none_with_zero_tensor(x, refs[i]) + for i, x in enumerate(xs)) + else: + return xs + + +def _grad(ys, xs, v=None): + """A gradient function that can be used in dynamic graph and static graph. + + The ``grad`` combines ``paddle.grad`` used in dynamic graph and + ``paddle.static.gradients`` used in static graph, and do following changes: + + * The ``allow_unused`` flag is removed and set defaults to true internally, + none in outputs will be replaced by zero tensor. + * The ``create_graph`` flag is removed and set defaults to true internally, + only makes sense in dynamic graph. 
+ * When xs is a single Tensor, ``paddle.grad`` returns a list which only + contains one Tensor. It may confuse users, thus in this case we improve + to return a single Tensor in _grad interface. + + Args: + ys (Tensor|Sequence[Tensor]): The output tensor or tensor sequence of + the graph to compute gradients. + xs (Tensor|Sequence[Tensor]): The input tensor or tensor sequence of the graph to + compute gradients. The returned values of this API are the + gradients of inputs . + v (Tensor|Sequence[Tensor]|None,optional): The initial gradient values + of outputs . If grad_outputs is None, the initial gradient values of + outputs would be Tensors filled with 1; if grad_outputs is not None, + it must have the same length as outputs , and in this case, the + initial gradient value of the i-th outputs would be: (1) a Tensor + filled with 1 when the i-th element of grad_outputs is None; + (2) the i-th element of grad_outputs when the i-th element of + grad_outputs is a Tensor. Default None. + + Returns: + Tensor|tuple[Tensor]: Tensor or a tuple of Tensors, whose length is the + same as the Tensor number inside inputs, and the i-th returned + Tensor is the sum of gradients of outputs with respect to the i-th + inputs. + """ + if paddle.fluid._non_static_mode(): + xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True) + else: + xs_grad = paddle.incubate.autograd.grad(ys, xs, v) + + if isinstance(xs, paddle.fluid.framework.Variable): + xs_grad = xs_grad[0] + + return _replace_none_with_zero_tensor(xs_grad, xs) + + +def _separate(xs): + """ + ``_separate`` separates ``xs`` from the computation graph through ``clone`` + or ``deteach`` . + + Interally, ``paddle.grad(xs, ys)`` is stateful API implemented based on + computional graph, which will reduce gradients along all path from ys to xs. + + However, funcional autograd API such as ``vjp``, ``jvp`` is stateless, and + only compute gradients with a given ``func`` . + + For example, given a ``func`` :math:`y0=f(x0)`, supposing forward path is: + ``x0 -> y0``, ``x0 -> x1 -> y0`` . + ``paddle.grad(y0, x0)`` will reduce gradients along ``y0->x0`` and + ``y0->x1->x0``, and ``vjp`` only need reduce along ``y0->x0``. + + So, it's needed to clone or detach xs for breaking the dependencies with + other variables. + + Examples: + + .. code-block:: python + + import paddle + from paddle.autograd.functional import _separate + + + def func(x, y): + return x * y + + + x = paddle.ones((1,)) + x.stop_gradient = False + + y = func(x, x) + print(paddle.grad(y, x)) + # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [2.])] + + x1, x2 = _separate((x, x)) + y = func(x1, x2) + print(paddle.grad(y, x1)) + # [Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.])] + + """ + if isinstance(xs, typing.Sequence): + return tuple(_single_separate(x) for x in xs) + else: + return _single_separate(xs) + + +def _single_separate(x): + if x is None: # x maybe none because grad input's v defaults to none. + return x + if not x.stop_gradient: + return paddle.clone(x) + else: # use detach to share memory when no need gradients. 
+ x = x.detach() + x.stop_gradient = False + return x + return x + + +def _check_inputs(func, xs, v=None): + if not callable(func): + raise TypeError(f"Expected 'fun' is Callable, but got {type(func)}.") + + if not isinstance(xs, (framework.Variable, typing.Sequence)): + raise TypeError(f"Expected 'xs' is a Tensor|Sequence[Tensor]," + f"but got {type(xs)}.") + if isinstance(xs, typing.Sequence) and not all( + isinstance(x, framework.Variable) for x in xs): + raise TypeError("All elements of 'xs' shoule be Tensor.") + + if not isinstance(v, (framework.Variable, typing.Sequence, type(None))): + raise TypeError( + f"Expected 'v' is Tensor|Sequence[Tensor]|None, but got {type(v)}.") + + if isinstance(v, typing.Sequence) and not all( + isinstance(e, framework.Variable) for e in v): + raise TypeError("All elements of 'xs' shoule be Tensor.") + + +def _check_v_shape(v, refs): + if v is None: + return + + v, refs = utils.as_tensors(v), utils.as_tensors(refs) + if len(refs) != len(v): + raise RuntimeError(f"The argument v is a tuple of invalid length:" + f"should be {len(refs)} but got {len(v)}.") + + for index, (element_v, element_ref) in enumerate(zip(v, refs)): + if element_v.shape != element_ref.shape: + raise RuntimeError( + f"The v[{index}] has invalid shape: should " + f"be {element_ref.shape} but got {element_v.shape}.") diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index 75a70b09731f2..5b3ad0dd78a3b 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -14,28 +14,26 @@ import typing -import paddle.autograd.utils as tensor_utils -import paddle.incubate.autograd.utils as prim_utils -from paddle.fluid import framework -from paddle.incubate.autograd import primx +from paddle.fluid import backward, framework +from paddle.incubate.autograd import primx, utils @framework.static_only -def forward_gradients(targets, inputs, input_gradients=None): +def forward_grad(outputs, inputs, grad_inputs=None): """Forward mode of automatic differentiation. .. note:: **ONLY available in the static mode and primitive operators.** Args: - targets: The target tensor or tensors + outputs: The output tensor or tensors inputs: The input tensor or tensors - input_gradients: The gradient Tensor or Tensors of inputs which has + grad_inputs: The gradient Tensor or Tensors of inputs which has the same shape with inputs, Defaults to None, in this case is equivalent to all ones . Returns: - target_gradients (Tensor|Sequence[Tensor]): The gradients for targets. + grad_outputs (Tensor|Sequence[Tensor]): The gradients for outputs. 
Examples: @@ -53,7 +51,7 @@ def forward_gradients(targets, inputs, input_gradients=None): with paddle.static.program_guard(main_program, startup_program): x = paddle.static.data('x', shape=[1], dtype='float32') y = x * x - y_grad = paddle.incubate.autograd.forward_gradients(y, x) + y_grad = paddle.incubate.autograd.forward_grad(y, x) paddle.incubate.autograd.prim2orig() exe = paddle.static.Executor() @@ -65,20 +63,20 @@ def forward_gradients(targets, inputs, input_gradients=None): paddle.incubate.autograd.disable_prim() paddle.disable_static() """ - if not prim_utils.prim_enabled(): - raise RuntimeError('forward_gradients must be running on primitive' + if not utils.prim_enabled(): + raise RuntimeError('forward_grad must be running on primitive' 'operators, use enable_prim to turn it on.') - if not isinstance(targets, (framework.Variable, typing.Sequence)): - raise TypeError(f'Expected targets is Tensor|Sequence[Tesnor], ' - f'but got {type(targets)}.') + if not isinstance(outputs, (framework.Variable, typing.Sequence)): + raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], ' + f'but got {type(outputs)}.') if not isinstance(inputs, (framework.Variable, typing.Sequence)): raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], ' f'but got {type(inputs)}.') - ys, xs, xs_dot = tensor_utils.as_tensors(targets), tensor_utils.as_tensors( - inputs), tensor_utils.as_tensors(input_gradients) + ys, xs, xs_dot = utils.as_tensors(outputs), utils.as_tensors( + inputs), utils.as_tensors(grad_inputs) block = framework.default_main_program().current_block() if any(x.block != block for x in xs + ys): @@ -90,4 +88,95 @@ def forward_gradients(targets, inputs, input_gradients=None): ad = primx.Transform(ys[0].block) _, ys_dot = ad.linearize(xs, ys, xs_dot) - return ys_dot[0] if isinstance(targets, framework.Variable) else ys_dot + return ys_dot[0] if isinstance(outputs, framework.Variable) else ys_dot + + +@framework.static_only +def grad(outputs, inputs, grad_outputs=None): + """Reverse mode of automatic differentiation. + + .. note:: + **ONLY available in the static mode and primitive operators** + + Args: + outputs (Tensor|Sequence[Tensor]): The output Tensor or Tensors. + inputs (Tensor|Sequence[Tensor]): The input Tensor or Tensors. + grad_outputs (Tensor|Sequence[Tensor]): The gradient Tensor or + Tensors of outputs which has the same shape with outputs, Defaults + to None, in this case is equivalent to all ones . + + Returns: + grad_inputs (Tensor|Tensors): The gradients for inputs. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data('x', shape=[1], dtype='float32') + x.stop_gradients = False + y = x * x + x_grad = paddle.incubate.autograd.grad(y, x) + paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() + exe.run(startup_program) + x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad]) + print(x_grad) + # [array([4.], dtype=float32)] + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + """ + + if not utils.prim_enabled(): + return backward.gradients(outputs, inputs, grad_outputs) + + if not isinstance(outputs, (framework.Variable, typing.Sequence)): + raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], ' + f'but got {type(outputs)}.') + + if not isinstance(inputs, (framework.Variable, typing.Sequence)): + raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], ' + f'but got {type(inputs)}.') + + ys, xs, ys_bar = utils.as_tensors(outputs), utils.as_tensors( + inputs), utils.as_tensors(grad_outputs) + block = framework.default_main_program().current_block() + if any((x is not None and x.block != block) for x in xs + ys): + raise RuntimeError( + 'Variable in inputs and outputs should be None or in current block of main program' + ) + + # TODO(Tongxin) without any prior knowledge about whether the program + # is completely lowered to primitive ops, it's mandatory to run the lowering + # pass once and again. This is obviously inefficient and needs to be + # optimized. + primx.orig2prim(block) + ad = primx.Transform(block) + xs_dot, ys_dot = ad.linearize(xs, ys) + if any(var is None for var in ys_dot): + raise RuntimeError( + 'Grads cannot be computed. The given outputs does not depend on inputs' + ) + ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar) + + # remove xs_dot and their constructor ops + op_indexes = [] + for var in xs_dot: + if var is not None: + op_index = block.ops.index(var.op) + if op_index < 0: + raise ValueError( + f'op_index should be greater than or equal to 0, but op_index={op_index}.' 
+ ) + op_indexes.append(op_index) + + ad.erase_ops(sorted(op_indexes)) + ad.erase_dots(xs_dot) + + return xs_bar[0] if isinstance(inputs, framework.Variable) else xs_bar diff --git a/python/paddle/incubate/autograd/primops.py b/python/paddle/incubate/autograd/primops.py index 6017ac3598920..b9a3ac459961a 100644 --- a/python/paddle/incubate/autograd/primops.py +++ b/python/paddle/incubate/autograd/primops.py @@ -14,6 +14,7 @@ import paddle from paddle.fluid.layer_helper import LayerHelper + from .primreg import REGISTER_FN diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index d5037dcf64994..260a97cdc16a4 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -22,7 +22,7 @@ from .primrules import _orig2prim, _prim2orig, _jvp, _transpose from .utils import get_input_var_list, get_output_var_list, flatten, flatten_and_remove_none from collections import OrderedDict -from paddle.autograd.utils import as_tensors +from paddle.incubate.autograd.utils import as_tensors def topo_path(xs, ys, block=None): @@ -577,47 +577,3 @@ def prim2orig(block=None): assert block == default_main_program().current_block( ), f'block is neither None nor current block of main program' _lower(block, reverse=True) - - -def _gradients(ys, xs, ys_bar=None): - """ A drop-in replacement of paddle.gradients but instead computing - on primitive ops. - - Args: - ys: the target tensor or tensors - xs: the input tensor or tensors - ys_bar: the optional gradient tensors of `ys` - - Returns: - xs_bar: a list gradients of input `xs` - """ - - ys, xs, ys_bar = as_tensors(ys), as_tensors(xs), as_tensors(ys_bar) - block = default_main_program().current_block() - for el in xs + ys: - assert el is None or el.block == block, f'variable in xs and ys should be None or in current block of main program' - # TODO(Tongxin) without any prior knowledge about whether the program - # is completely lowered to primitive ops, it's mandatory to run the lowering - # pass once and again. This is obviously inefficient and needs to be - # optimized. - orig2prim(block) - - ad = Transform(block) - - xs_dot, ys_dot = ad.linearize(xs, ys) - if any(var is None for var in ys_dot): - assert False, f'Gradients cannot be computed. The given output `ys` does not depend on input `xs`.' - ys_bar, xs_bar = ad.transpose(ys_dot, xs_dot, ys_bar) - # remove xs_dot and their constructor ops - - op_indexes = [] - for var in xs_dot: - if var is not None: - op_index = block.ops.index(var.op) - assert op_index >= 0, f'op_index should be greater than or equal to 0, but op_index={op_index}.' - op_indexes.append(op_index) - - ad.erase_ops(sorted(op_indexes)) - ad.erase_dots(xs_dot) - - return xs_bar diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index 9d6a8c4f6a36d..96faf7f7440ca 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
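The _gradients helper removed from primx.py above has its logic carried over into paddle.incubate.autograd.grad (defined in primapi.py earlier in this patch). A minimal sketch of the new entry point, mirroring the third-order gradient test; the cubic function and the input value 2.0 are illustrative only and assume prim mode is enabled:

    import numpy as np
    import paddle

    paddle.enable_static()
    paddle.incubate.autograd.enable_prim()
    main, startup = paddle.static.Program(), paddle.static.Program()
    with paddle.static.program_guard(main, startup):
        x = paddle.static.data('x', shape=[1], dtype='float32')
        x.stop_gradient = False
        y = x * x * x
        grad1, = paddle.incubate.autograd.grad([y], [x])      # 3 * x ** 2
        grad2, = paddle.incubate.autograd.grad([grad1], [x])  # 6 * x
        # Lower the remaining primitive ops before execution.
        paddle.incubate.autograd.prim2orig(main.block(0))
    exe = paddle.static.Executor()
    exe.run(startup)
    outs = exe.run(main,
                   feed={'x': np.array([2.], dtype='float32')},
                   fetch_list=[grad2])
    print(outs)  # [array([12.], dtype=float32)]
    paddle.incubate.autograd.disable_prim()
    paddle.disable_static()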
+import typing import paddle from paddle.fluid import framework as framework @@ -170,3 +171,12 @@ def flatten(inp): def flatten_and_remove_none(inp): flattened = flatten(inp) return [var for var in flattened if var is not None] + + +def as_tensors(xs): + if isinstance(xs, framework.Variable): + return (xs, ) + elif isinstance(xs, typing.Sequence): + return tuple(xs) + else: + return xs From 0a04b8a9980e3a409642201707c0f1d95be4c5d8 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Mon, 11 Jul 2022 20:27:40 +0800 Subject: [PATCH 128/250] [IPU] support more ops 0/N (#44204) * add authors Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen * squash cpp changes 1/N * clean code Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen --- .../ir/ipu/optimizer_extract_pass.cc | 13 - .../ir/ipu/popart_canonicalization_pass.cc | 11 + .../fluid/platform/device/ipu/ipu_compiler.cc | 39 +- .../canonicalization_utils.cc | 53 ++ .../canonicalization_utils.h | 7 + .../ipu/popart_canonicalization/logic_ops.cc | 34 ++ .../ipu/popart_canonicalization/loss_ops.cc | 508 ++++++++++++++++++ .../ipu/popart_canonicalization/math_ops.cc | 287 +++------- .../ipu/popart_canonicalization/op_builder.cc | 34 +- .../ipu/popart_canonicalization/op_builder.h | 6 + .../ipu/popart_canonicalization/other_ops.cc | 12 - .../ipu/popart_canonicalization/reduce_ops.cc | 52 ++ .../device/ipu/supported_ops_autogen.h | 1 + 13 files changed, 795 insertions(+), 262 deletions(-) create mode 100644 paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index f28696194e5f6..b45a39aaa8680 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -287,19 +287,6 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { } else if (op_role == OpRole::kLRSched) { // op_role == OpRole::kLRSched | OpRole::kOptimize new_op.SetAttr("with_lr_sched", true); - } else if (op_type == "identity_loss") { - auto outputs = op->Outputs(); - PADDLE_ENFORCE_EQ( - outputs.size(), - 1, - platform::errors::InvalidArgument("Can only support one loss key")); - auto losses = outputs.begin()->second; - PADDLE_ENFORCE_EQ( - losses.size(), - 1, - platform::errors::InvalidArgument("Can only support one loss name")); - auto loss_var = losses.front(); - new_op.SetAttr("loss_var", loss_var); } } diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc index 6806e44f09505..222ca619c223f 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -30,8 +30,13 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { auto custom_ops = Get>("custom_ops"); std::vector missing_ops; auto sorted_ops = TopologySortOperations(*graph); + std::unordered_set delete_nodes; for (auto* node : sorted_ops) { auto* op = node->Op(); + if (platform::ipu::IsMarkedForDeletion(node)) { + delete_nodes.insert(node); + continue; + } auto op_type = op->Type(); ir::Node* new_node = nullptr; @@ -67,6 +72,12 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { "Found unimplemented op_handler(s) for IPU")); } + for (auto* node : delete_nodes) { + // TODO(czr): possible remove + platform::ipu::ClearNode(node); + graph->RemoveNode(node); + } + // post 
popart_canonicalization VLOG(10) << "Post Graph: "; diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index 930af7e1470fc..09e68ab518746 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -445,6 +445,7 @@ void Compiler::LowerWeights(const Scope* scope) { for (size_t i = 0; i < tensor.dims().size(); ++i) { shape.push_back(tensor.dims().at(i)); } + popart::TensorInfo tensor_info(dtype, shape); popart::ConstVoidData const_data{tensor.data(), tensor_info}; if (!node->outputs.empty()) { @@ -530,21 +531,26 @@ void Compiler::LowerOptimizer(const Scope* scope) { auto raw_type = BOOST_GET_CONST(std::string, op_desc->GetAttr("raw_type")); resources_->optimizer_type = raw_type; - auto loss_var = - BOOST_GET_CONST(std::string, op_desc->GetAttr("loss_var")); - resources_->loss_var = resources_->tensors[loss_var]; resources_->with_lr_sched = BOOST_GET_CONST(bool, op_desc->GetAttr("with_lr_sched")); if (ipu_strategy_->is_dynamic) { + // loss_var in dy2static is set by identity_loss. And lr is + // passed by ipu_strategy. resources_->lr = ipu_strategy_->lr; - } else if (op_desc->HasAttr("lr_var")) { - auto lr_var = BOOST_GET_CONST(std::string, op_desc->GetAttr("lr_var")); - resources_->lr_var = lr_var; - resources_->lr = GetSingleVarFromScope(scope, lr_var); } else { - // adadelta has no lr - resources_->lr = 0.01f; - resources_->with_lr_sched = false; + auto loss_var = + BOOST_GET_CONST(std::string, op_desc->GetAttr("loss_var")); + resources_->loss_var = resources_->tensors[loss_var]; + if (op_desc->HasAttr("lr_var")) { + auto lr_var = + BOOST_GET_CONST(std::string, op_desc->GetAttr("lr_var")); + resources_->lr_var = lr_var; + resources_->lr = GetSingleVarFromScope(scope, lr_var); + } else { + // adadelta has no lr + resources_->lr = 0.01f; + resources_->with_lr_sched = false; + } } VLOG(10) << "Set initial lr: " << resources_->lr; @@ -766,6 +772,19 @@ void Compiler::LowerOptimizer(const Scope* scope) { PADDLE_THROW(platform::errors::Unimplemented( "optimizer %s is not implemented", type)); } + } else if (op_type == "popart_identity_loss") { + auto outputs = op_desc->Outputs(); + PADDLE_ENFORCE_EQ( + outputs.size(), + 1, + platform::errors::InvalidArgument("Can only support one loss key")); + auto losses = outputs.begin()->second; + PADDLE_ENFORCE_EQ( + losses.size(), + 1, + platform::errors::InvalidArgument("Can only support one loss name")); + auto loss_var = losses.front(); + resources_->loss_var = resources_->tensors[loss_var]; } } } diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc index 44fdf764c5bcc..c4960616b9db0 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc @@ -138,6 +138,59 @@ const ONNXDataType GetOutputVarDType(const Node *node, return GetVarDType(out_node); } +bool IsLastVarNode(Node *node) { + return node->IsVar() && node->outputs.size() == 0; +} + +void MarkNodeForDeletion(Node *node) { node->Op()->SetAttr("delete_node", 1); } + +bool IsMarkedForDeletion(Node *node) { + return node->Op()->HasAttr("delete_node") && + BOOST_GET_CONST(int, node->Op()->GetAttr("delete_node")) > 0; +} + +int RemoveTailReduction(Graph *graph, + Node *loss_op, + const std::string &output_var_name) { + // Sum: 0. 
Mean: 1. None: 2 + int reduction = 2; + Node *reduction_op; + auto loss_output = GetOutputVarNode(output_var_name, loss_op); + for (auto sub_node : loss_output->outputs) { + if (!sub_node->IsOp()) continue; + if (sub_node->Op()->Type() == "reduce_sum") { + reduction = 0; + reduction_op = sub_node; + } else if (sub_node->Op()->Type() == "reduce_mean") { + reduction = 1; + reduction_op = sub_node; + } + } + if (reduction == 2) return reduction; + auto reduction_out = reduction_op->outputs[0]; + loss_op->Op()->SetOutput(output_var_name, + std::vector({reduction_out->Name()})); + MarkNodeForDeletion(reduction_op); + DisConnectNodes(loss_output, reduction_op); + DisConnectNodes(reduction_op, reduction_out); + ConnectNodes(loss_op, reduction_out); + + return reduction; +} + +int ConvertToPopartReduction(const std::string &reduction) { + // Sum: 0. Mean: 1. None: 2 + if (reduction == "sum") { + return 0; + } else if (reduction == "mean") { + return 1; + } else if (reduction == "none") { + return 2; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "reduction %s is not supported on ipu.", reduction)); +} + } // namespace ipu } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h index 536b69a39b9a1..611d863c496a8 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h @@ -85,6 +85,13 @@ const bool is_float_equal(float a, float b, float eps = 1e-8); const ONNXDataType GetVarDType(const Node *node); const ONNXDataType GetOutputVarDType(const Node *node, const std::string &output_name = "Out"); +void MarkNodeForDeletion(Node *node); +bool IsMarkedForDeletion(Node *node); +bool IsLastVarNode(Node *node); +int RemoveTailReduction(Graph *graph, + Node *loss_op, + const std::string &output_var_name); +int ConvertToPopartReduction(const std::string &reduction); } // namespace ipu } // namespace platform diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc index c10a30997a4da..155c11b03b8fc 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -91,6 +91,38 @@ Node *less_than_handler(Graph *graph, Node *node) { {}); } +Node *greater_equal_handler(Graph *graph, Node *node) { + auto less_op = + CreateBaseOp(graph, + node, + "popart_less", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {}, + {}); + return CreateBaseOp(graph, + node, + "popart_logical_not", + less_op->outputs, + {GetOutputVarNode("Out", node)}, + {}); +} + +Node *less_equal_handler(Graph *graph, Node *node) { + auto less_op = + CreateBaseOp(graph, + node, + "popart_greater", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {}, + {}); + return CreateBaseOp(graph, + node, + "popart_logical_not", + less_op->outputs, + {GetOutputVarNode("Out", node)}, + {}); +} + } // namespace } // namespace ipu } // namespace platform @@ -103,3 +135,5 @@ REGISTER_HANDLER(logical_or, logical_or_handler); REGISTER_HANDLER(logical_and, logical_and_handler); REGISTER_HANDLER(greater_than, greater_than_handler); REGISTER_HANDLER(less_than, less_than_handler); +REGISTER_HANDLER(greater_equal, greater_equal_handler); 
+REGISTER_HANDLER(less_equal, less_equal_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc new file mode 100644 index 0000000000000..438304fcfc709 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/loss_ops.cc @@ -0,0 +1,508 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +bool is_dynamic_graph() { + auto *ipu_backend = platform::ipu::IpuBackend::GetInstance(); + return ipu_backend->GetIpuStrategy()->is_dynamic; +} + +Node *identity_loss_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto reduction = BOOST_GET_CONST(int, op->GetAttr("reduction")); + return CreateIdentityLossOp( + graph, node, node->inputs, node->outputs, reduction); +} + +Node *cross_entropy_general_handler(Graph *graph, + Node *node, + Node *logits, + Node *label, + Node *output, + bool soft_label, + int ignore_index, + int reduction, + int axis) { + Node *cast_and_reshape = nullptr; + Node *final_loss_node = nullptr; + if (soft_label) { + PADDLE_THROW(platform::errors::InvalidArgument( + "soft_label is not supported yet in IPU")); + } + bool append_identity_loss = is_dynamic_graph(); + bool is_last_var_node = IsLastVarNode(output); + append_identity_loss = append_identity_loss && is_last_var_node; + + if (label->Var()->GetDataType() == framework::proto::VarType::INT32) { + cast_and_reshape = label; + } else { + cast_and_reshape = + CreateCast(graph, node, {label}, {}, framework::proto::VarType::INT32) + ->outputs.front(); + } + + auto label_shape_ = label->Var()->GetShape(); + auto logits_shape_ = logits->Var()->GetShape(); + + axis = axis < 0 ? logits_shape_.size() + axis : axis; + + auto label_transposed(label_shape_); + + if (axis != (logits_shape_.size() - 1)) { + // the softmax axis(a) is not at the last dimension. + // logit shape: [N1, ..., C, ..., Nk] + // label shape: [N1, ..., 1, ..., Nk] + // _____^_____ + // dim: 0, ..., a, ..., k-1 + // needs to transpose the softmax axis in logit to last dimension + // with following transpose perm: [0, ..., a-1, a+1, ..., k-1, a] + std::vector trans(logits_shape_.size(), 0); + std::iota(trans.begin(), trans.begin() + axis, 0); + std::iota(trans.begin() + axis, trans.end() - 1, axis + 1); + trans.back() = axis; + + // transpose logits + logits = + CreateBaseOp( + graph, node, "popart_transpose", {logits}, {}, {{"perm", trans}}) + ->outputs.front(); + + // no need to transpose label, transform the label size and reshape later. 
+ std::transform( + trans.cbegin(), + trans.cend(), + label_transposed.begin(), + [&label_shape_](int64_t index) { return label_shape_[index]; }); + } + + if (label_transposed.back() == 1) { + // input shape: [N1, N2, ... , Nk, C] + // label shape: [N1, N2, ... , Nk, 1] + // reshape label shape to [N1, N2, ... , Nk] + std::vector new_shape_(label_transposed.begin(), + label_transposed.end() - 1); + auto const_before_loss = + CreateBaseOp( + graph, + node, + "popart_constant", + {}, + {}, + {{"value", new_shape_}, + {"dims", + std::vector{static_cast(new_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs.front(); + + cast_and_reshape = CreateBaseOp(graph, + node, + "popart_reshape", + {cast_and_reshape, const_before_loss}, + {}, + {}) + ->outputs.front(); + } + + auto log = CreateBaseOp(graph, node, "popart_log", {logits}, {}, {}) + ->outputs.front(); + + bool reshape_back = reduction == 2 && label_transposed.back() == 1; + + final_loss_node = CreateBaseOp(graph, + node, + "popart_nllloss_v2", + {log, cast_and_reshape}, + !(reshape_back || append_identity_loss) + ? std::vector{output} + : std::vector{}, + { + {"reduction", reduction}, + {"ignoreIndex", ignore_index}, + {"inputIsLogProbability", true}, + }) + ->outputs.front(); + + if (reshape_back) { + // reshape output to the shape of input label. + auto const_after_loss = + CreateBaseOp( + graph, + node, + "popart_constant", + {}, + {}, + {{"value", label_shape_}, + {"dims", + std::vector{static_cast(label_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs.front(); + final_loss_node = + CreateBaseOp(graph, + node, + "popart_reshape", + {final_loss_node, const_after_loss}, + append_identity_loss ? std::vector{} + : std::vector{output}, + {}) + ->outputs.front(); + } + + if (append_identity_loss) { + final_loss_node = + CreateIdentityLossOp(graph, node, {final_loss_node}, {output}, 2); + } + + return final_loss_node; +} + +Node *cross_entropy2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + int reduction = RemoveTailReduction(graph, node, "Y"); + auto logits = GetInputVarNode("X", node); + auto label = GetInputVarNode("Label", node); + auto output = GetOutputVarNode("Y", node); + auto ignore_index = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); + return cross_entropy_general_handler(graph, + node, + logits, + label, + output, + false, /*soft_label*/ + ignore_index, + reduction, + -1); /*axis*/ +} + +Node *softmax_with_cross_entropy_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + int reduction = RemoveTailReduction(graph, node, "Loss"); + auto logits = GetInputVarNode("Logits", node); + auto label = GetInputVarNode("Label", node); + auto output = GetOutputVarNode("Loss", node); + auto ignore_index = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + auto soft_label = BOOST_GET_CONST(bool, op->GetAttr("soft_label")); + + logits = CreateSoftmaxOpset11( + graph, node, {logits}, {GetOutputVarNode("Softmax", node)}, axis) + ->outputs.front(); + return cross_entropy_general_handler(graph, + node, + logits, + label, + output, + soft_label, + ignore_index, + reduction, + axis); +} + +Node *kldiv_loss_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto reduction = ConvertToPopartReduction( + BOOST_GET_CONST(std::string, op->GetAttr("reduction"))); + if (reduction == 2) { + reduction = RemoveTailReduction(graph, node, "Loss"); + } + bool append_identity_loss = is_dynamic_graph(); + bool is_last_var_node = 
IsLastVarNode(GetOutputVarNode("Loss", node)); + append_identity_loss = append_identity_loss && is_last_var_node; + + // log(pred) + auto log = + CreateBaseOp( + graph, node, "popart_log", {GetInputVarNode("Target", node)}, {}, {}) + ->outputs.front(); + + // log(pred) - label + auto log_minus = + CreateBaseOp( + graph, node, "popart_sub", {log, GetInputVarNode("X", node)}, {}, {}) + ->outputs.front(); + + // label * (log(pred) - label) + auto loss = + CreateBaseOp(graph, + node, + "popart_mul", + {GetInputVarNode("Target", node), log_minus}, + append_identity_loss || reduction != 2 + ? std::vector{} + : std::vector{GetOutputVarNode("Loss", node)}, + {}); + + auto attrs = AttributeMap{{"reduce_all", true}, {"keepdims", 0L}}; + if (append_identity_loss) { + loss = CreateIdentityLossOp(graph, + node, + loss->outputs, + {GetOutputVarNode("Loss", node)}, + reduction); + } else if (reduction == 0) { + // Sum + loss = CreateBaseOp(graph, + node, + "popart_reducesum", + loss->outputs, + {GetOutputVarNode("Loss", node)}, + attrs); + } else if (reduction == 1) { + // Mean + loss = CreateBaseOp(graph, + node, + "popart_reducemean", + loss->outputs, + {GetOutputVarNode("Loss", node)}, + attrs); + } + return loss; +} + +Node *binary_cross_entropy_handler(Graph *graph, Node *node) { + // Out = -1 * weight * (label * log(x) + (1 - label) * log(1 - x)) + int reduction = 2; + if (is_dynamic_graph()) { + reduction = RemoveTailReduction(graph, node, "Out"); + } + bool append_identity_loss = + is_dynamic_graph() && IsLastVarNode(GetOutputVarNode("Loss", node)); + + auto x = GetInputVarNode("X", node); + auto label = GetInputVarNode("Label", node); + // log(x) + auto log = + CreateBaseOp(graph, node, "popart_log", {x}, {}, {})->outputs.front(); + + // label * log(x) + auto log_mul = CreateBaseOp(graph, node, "popart_mul", {label, log}, {}, {}) + ->outputs.front(); + + // const one + auto one = + CreateConst(graph, node, std::vector{1.0}, {1}, GetVarDType(x)) + ->outputs.front(); + // (1 - x) + auto minus_input = CreateBaseOp(graph, node, "popart_sub", {one, x}, {}, {}) + ->outputs.front(); + + // log(1 - x) + auto log_minus_input = + CreateBaseOp(graph, node, "popart_log", {minus_input}, {}, {}) + ->outputs.front(); + + // (1 - label) + auto minus_label = + CreateBaseOp(graph, node, "popart_sub", {one, label}, {}, {}) + ->outputs.front(); + + // (1 - label) * log(1 - x) + auto minus_log_mul = + CreateBaseOp( + graph, node, "popart_mul", {minus_label, log_minus_input}, {}, {}) + ->outputs.front(); + + // (label * log(x) + (1 - label) * log(1 - x)) + auto add = + CreateBaseOp(graph, node, "popart_add", {log_mul, minus_log_mul}, {}, {}) + ->outputs.front(); + + // -1 * (label * log(x) + (1 - label) * log(1 - x)) + auto loss = CreateBaseOp( + graph, + node, + "popart_neg", + {add}, + append_identity_loss ? 
std::vector{} + : std::vector{GetOutputVarNode("Out", node)}, + {}); + if (append_identity_loss) { + loss = CreateIdentityLossOp( + graph, node, loss->outputs, {GetOutputVarNode("Out", node)}, reduction); + } + return loss; +} + +Node *huber_loss_handler(Graph *graph, Node *node) { + // if abs(label - input) < delta + // huber_loss = 0.5 * (label - input) * (label - input) + // else + // huber_loss = delta * abs(label - input) - 0.5 * delta * delta + auto *op = node->Op(); + int reduction = 2; + if (is_dynamic_graph()) { + reduction = RemoveTailReduction(graph, node, "Out"); + } + bool append_identity_loss = + is_dynamic_graph() && IsLastVarNode(GetOutputVarNode("Out", node)); + + auto x = GetInputVarNode("X", node); + auto label = GetInputVarNode("Y", node); + // (label - input) + auto diff = CreateBaseOp(graph, node, "popart_sub", {label, x}, {}, {}) + ->outputs.front(); + + // abs(label - input) + auto abs_diff = + CreateBaseOp(graph, node, "popart_abs", {diff}, {}, {})->outputs.front(); + + // const 0.5 + auto dot_five = + CreateConst(graph, node, std::vector{0.5}, {1}, GetVarDType(x)) + ->outputs.front(); + + // const delta + auto delta_value = BOOST_GET_CONST(float, op->GetAttr("delta")); + auto delta = + CreateConst( + graph, node, std::vector{delta_value}, {1}, GetVarDType(x)) + ->outputs.front(); + auto delta_square_coff = + CreateConst(graph, + node, + std::vector{0.5f * delta_value * delta_value}, + {1}, + GetVarDType(x)) + ->outputs.front(); + + // (label - input) * (label - input) + auto square = CreateBaseOp(graph, node, "popart_mul", {diff, diff}, {}, {}) + ->outputs.front(); + + // 0.5 * (label - input) * (label - input) + auto dot_five_square = + CreateBaseOp(graph, node, "popart_mul", {dot_five, square}, {}, {}) + ->outputs.front(); + + // delta * abs(label - input) + auto delta_mul_diff = + CreateBaseOp(graph, node, "popart_mul", {delta, abs_diff}, {}, {}) + ->outputs.front(); + + // delta * abs(label - input) - 0.5 * delta * delta + auto sub_delta_square = CreateBaseOp(graph, + node, + "popart_sub", + {delta_mul_diff, delta_square_coff}, + {}, + {}) + ->outputs.front(); + + // abs(label - input) < delta + auto less_cond = + CreateBaseOp(graph, node, "popart_less", {abs_diff, delta}, {}, {}) + ->outputs.front(); + auto loss = CreateBaseOp( + graph, + node, + "popart_where", + {less_cond, dot_five_square, sub_delta_square}, + append_identity_loss ? 
std::vector{} + : std::vector{GetOutputVarNode("Out", node)}, + {}); + + if (append_identity_loss) { + loss = CreateIdentityLossOp( + graph, node, loss->outputs, {GetOutputVarNode("Out", node)}, reduction); + } + return loss; +} + +Node *warpctc_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto logits = GetInputVarNode("Logits", node); + auto label = GetInputVarNode("Label", node); + auto logits_length = GetInputVarNode("LogitsLength", node); + auto label_length = GetInputVarNode("LabelLength", node); + auto blank = BOOST_GET_CONST(int, op->GetAttr("blank")); + auto norm_by_times = BOOST_GET_CONST(bool, op->GetAttr("norm_by_times")); + int reduction = 2; + if (is_dynamic_graph()) { + reduction = RemoveTailReduction(graph, node, "Loss"); + } + bool append_identity_loss = + is_dynamic_graph() && IsLastVarNode(GetOutputVarNode("Loss", node)); + if (norm_by_times) { + PADDLE_THROW(platform::errors::InvalidArgument( + "norm_by_times is not supported yet in IPU")); + } + + int axis = -1; + auto softmax_logits = + CreateSoftmaxOpset11(graph, node, {logits}, {}, axis)->outputs.front(); + auto log_softmax_logits = + CreateBaseOp(graph, node, "popart_log", {softmax_logits}, {}, {}) + ->outputs.front(); + auto cast_label = CreateBaseOp(graph, + node, + "popart_cast", + {label}, + {}, + {{"to", std::string("UINT32")}}) + ->outputs.front(); + auto cast_logits_length = CreateBaseOp(graph, + node, + "popart_cast", + {logits_length}, + {}, + {{"to", std::string("UINT32")}}) + ->outputs.front(); + auto cast_label_length = CreateBaseOp(graph, + node, + "popart_cast", + {label_length}, + {}, + {{"to", std::string("UINT32")}}) + ->outputs.front(); + // TODO(czr): zero_infinity is not supported in current sdk which lead + // difference with paddle result. + auto loss = CreateBaseOp( + graph, + node, + "popart_ctcloss", + {log_softmax_logits, cast_label, cast_logits_length, cast_label_length}, + append_identity_loss + ? 
std::vector{} + : std::vector{GetOutputVarNode("Loss", node)}, + {{"blank", blank}, + {"reduction", reduction}, + {"outDataType", std::string("UNDEFINED")}}); + if (append_identity_loss) { + loss = CreateIdentityLossOp( + graph, node, loss->outputs, {GetOutputVarNode("Loss", node)}, 2); + } + return loss; +} + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle + +REGISTER_HANDLER(identity_loss, identity_loss_handler); +REGISTER_HANDLER(softmax_with_cross_entropy, + softmax_with_cross_entropy_handler); +REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler); +REGISTER_HANDLER(kldiv_loss, kldiv_loss_handler); +REGISTER_HANDLER(bce_loss, binary_cross_entropy_handler); +REGISTER_HANDLER(huber_loss, huber_loss_handler); +REGISTER_HANDLER(warpctc, warpctc_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc index e47a723125b76..ddd7d9453cfa5 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -114,14 +114,29 @@ Node *matmul_handler(Graph *graph, Node *node) { auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X")); auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y")); auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha")); - auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); - auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + Node *x_node = GetInputVarNode("X", node); + Node *y_node = GetInputVarNode("Y", node); + int x_rank = x_node->Var()->GetShape().size(); + int y_rank = y_node->Var()->GetShape().size(); + + auto gen_perm = [](const int rank) -> std::vector { + std::vector perm; + if (rank == 1) { + perm = std::vector{0}; + } else if (rank == 2) { + perm = std::vector{1, 0}; + } else if (rank == 3) { + perm = std::vector{0, 2, 1}; + } else if (rank == 4) { + perm = std::vector{0, 1, 3, 2}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op matmul with input rank == %d", rank)); + } + return perm; + }; - int x_rank = x_shape.size(); - std::vector perm; - if (x_rank == 1) { - perm = std::vector{0}; - } else if (x_rank == 2) { + if (x_rank == 2) { if (!transpose_x && !transpose_y && is_float_equal(alpha, 1.0f)) { return CreateBaseOp( graph, @@ -137,18 +152,10 @@ Node *matmul_handler(Graph *graph, Node *node) { transpose_x, transpose_y, alpha); - } else if (x_rank == 3) { - perm = std::vector{0, 2, 1}; - } else if (x_rank == 4) { - perm = std::vector{0, 1, 3, 2}; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "op matmul with input rank == %d", x_rank)); } - Node *x_node = GetInputVarNode("X", node); - Node *y_node = GetInputVarNode("Y", node); if (transpose_x) { + auto perm = gen_perm(x_rank); x_node = CreateBaseOp(graph, node, "popart_transpose", @@ -158,6 +165,7 @@ Node *matmul_handler(Graph *graph, Node *node) { x_node = x_node->outputs[0]; } if (transpose_y) { + auto perm = gen_perm(y_rank); y_node = CreateBaseOp(graph, node, "popart_transpose", @@ -209,7 +217,7 @@ Node *scale_handler(Graph *graph, Node *node) { CreateCast(graph, node, {GetInputVarNode("X", node)}, {}, VarType::FP32); Node *result = nullptr; - if (!op->Input("ScaleTensor").empty()) { + if (op->InputArgumentNames().size() > 1) { auto scale = GetInputVarNode("ScaleTensor", node); if (is_float_equal(bias_, 0.0)) { result = CreateBaseOp( @@ -321,183 +329,6 @@ Node *scale_handler(Graph *graph, 
Node *node) { return result_after_cast; } -Node *cross_entropy2_handler(Graph *graph, Node *node) { - auto *op = node->Op(); - auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); - Node *new_cast = nullptr; - if (GetInputVarNode("Label", node)->Var()->GetDataType() == VarType::INT32) { - new_cast = GetInputVarNode("Label", node); - } else { - auto new_cast = CreateCast( - graph, node, {GetInputVarNode("Label", node)}, {}, VarType::INT32); - new_cast = new_cast->outputs[0]; - } - auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape(); - if (label_shape_[label_shape_.size() - 1] != 1) { - auto log = CreateBaseOp( - graph, node, "popart_log", {GetInputVarNode("X", node)}, {}, {}); - return CreateBaseOp( - graph, - node, - "popart_nllloss_v2", - {log->outputs[0], new_cast}, - {GetOutputVarNode("Y", node)}, - { - {"reduction", 2}, // popart::ReductionType::NoReduction - {"ignoreIndex", ignoreIndex}, - {"inputIsLogProbability", true}, - }); - } else { - std::vector new_shape_{label_shape_[0]}; - auto const_before_loss = CreateBaseOp( - graph, - node, - "popart_constant", - {}, - {}, - {{"value", new_shape_}, - {"dims", - std::vector{static_cast(new_shape_.size())}}, - {"dtype", ONNXDataType::INT64}}); - - auto reshape_before_loss = - CreateBaseOp(graph, - node, - "popart_reshape", - {new_cast, const_before_loss->outputs[0]}, - {}, - {}); - - auto log = CreateBaseOp( - graph, node, "popart_log", {GetInputVarNode("X", node)}, {}, {}); - auto nllloss = CreateBaseOp( - graph, - node, - "popart_nllloss_v2", - {log->outputs[0], reshape_before_loss->outputs[0]}, - {}, - { - {"reduction", 2}, // popart::ReductionType::NoReduction - {"ignoreIndex", ignoreIndex}, - {"inputIsLogProbability", true}, - }); - - auto const_after_loss = CreateBaseOp( - graph, - node, - "popart_constant", - {}, - {}, - {{"value", label_shape_}, - {"dims", - std::vector{static_cast(label_shape_.size())}}, - {"dtype", ONNXDataType::INT64}}); - - auto reshape_after_loss = - CreateBaseOp(graph, - node, - "popart_reshape", - {nllloss->outputs[0], const_after_loss->outputs[0]}, - {GetOutputVarNode("Y", node)}, - {}); - return reshape_after_loss; - } -} - -Node *softmax_with_cross_entropy_handler(Graph *graph, Node *node) { - auto *op = node->Op(); - auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index")); - auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); - auto soft_label = BOOST_GET_CONST(bool, op->GetAttr("soft_label")); - if (soft_label) { - PADDLE_THROW(platform::errors::InvalidArgument( - "soft_label is not supported yet in IPU")); - } - Node *new_cast = nullptr; - if (GetInputVarNode("Label", node)->Var()->GetDataType() == VarType::INT32) { - new_cast = GetInputVarNode("Label", node); - } else { - auto new_cast = CreateCast( - graph, node, {GetInputVarNode("Label", node)}, {}, VarType::INT32); - new_cast = new_cast->outputs[0]; - } - auto softmax_node = CreateSoftmaxOpset11( - graph, node, {GetInputVarNode("Logits", node)}, {}, axis); - - auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape(); - if (label_shape_[label_shape_.size() - 1] != 1) { - auto log = CreateBaseOp( - graph, node, "popart_log", {softmax_node->outputs[0]}, {}, {}); - // softmax_with_cross_entropy is split to several ops in python. - // reduction is not needed here. 
- return CreateBaseOp( - graph, - node, - "popart_nllloss_v2", - {log->outputs[0], new_cast}, - {GetOutputVarNode("Loss", node)}, - { - {"reduction", 2}, // popart::ReductionType::NoReduction - {"ignoreIndex", ignoreIndex}, - {"inputIsLogProbability", true}, - }); - } else { - std::vector new_shape_{label_shape_[0]}; - auto const_before_loss = CreateBaseOp( - graph, - node, - "popart_constant", - {}, - {}, - {{"value", new_shape_}, - {"dims", - std::vector{static_cast(new_shape_.size())}}, - {"dtype", ONNXDataType::INT64}}); - - auto reshape_before_loss = - CreateBaseOp(graph, - node, - "popart_reshape", - {new_cast, const_before_loss->outputs[0]}, - {}, - {}); - - auto log = CreateBaseOp( - graph, node, "popart_log", {softmax_node->outputs[0]}, {}, {}); - auto nllloss = CreateBaseOp( - graph, - node, - "popart_nllloss_v2", - {log->outputs[0], reshape_before_loss->outputs[0]}, - {}, - { - {"reduction", 2}, // popart::ReductionType::NoReduction - {"ignoreIndex", ignoreIndex}, - {"inputIsLogProbability", true}, - }); - - auto const_after_loss = CreateBaseOp( - graph, - node, - "popart_constant", - {}, - {}, - {{"value", label_shape_}, - {"dims", - std::vector{static_cast(label_shape_.size())}}, - {"dtype", ONNXDataType::INT64}}); - - auto reshape_after_loss = - CreateBaseOp(graph, - node, - "popart_reshape", - {nllloss->outputs[0], const_after_loss->outputs[0]}, - {GetOutputVarNode("Loss", node)}, - {}); - return reshape_after_loss; - } -} - Node *cumsum_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto exclusive = BOOST_GET_CONST(bool, op->GetAttr("exclusive")); @@ -512,41 +343,63 @@ Node *cumsum_handler(Graph *graph, Node *node) { {{"value", std::vector{axis}}, {"dims", std::vector{1}}, {"dtype", ONNXDataType::INT64}}); - return CreateBaseOp( + Node *input_x = nullptr; + auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType(); + bool need_cast = data_type_ != VarType::FP32; + std::vector cumsum_out; + if (need_cast) { + auto cast_x = CreateCast( + graph, node, {GetInputVarNode("X", node)}, {}, VarType::FP32); + input_x = cast_x->outputs[0]; + } else { + input_x = GetInputVarNode("X", node); + cumsum_out.emplace_back(GetOutputVarNode("Out", node)); + } + auto cumsum_node = CreateBaseOp( graph, node, "popart_cumsum", - {GetInputVarNode("X", node), axis_node->outputs[0]}, - {GetOutputVarNode("Out", node)}, + {input_x, axis_node->outputs[0]}, + cumsum_out, {{"exclusive", popart_exclusive}, {"reverse", popart_reverse}}); + if (need_cast) { + cumsum_node = CreateCast(graph, + node, + cumsum_node->outputs, + {GetOutputVarNode("Out", node)}, + data_type_); + } + return cumsum_node; } Node *matmul_v2_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("trans_x")); auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("trans_y")); - auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); - auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); - - std::vector perm; - int x_rank = x_shape.size(); - if (x_rank == 1) { - perm = std::vector{0}; - } else if (x_rank == 2) { - perm = std::vector{1, 0}; - } else if (x_rank == 3) { - perm = std::vector{0, 2, 1}; - } else if (x_rank == 4) { - perm = std::vector{0, 1, 3, 2}; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "op matmul with input rank == %d", x_rank)); - } - Node *x_node = GetInputVarNode("X", node); Node *y_node = GetInputVarNode("Y", node); + int x_rank = x_node->Var()->GetShape().size(); + int y_rank = 
y_node->Var()->GetShape().size(); + + auto gen_perm = [](const int rank) -> std::vector { + std::vector perm; + if (rank == 1) { + perm = std::vector{0}; + } else if (rank == 2) { + perm = std::vector{1, 0}; + } else if (rank == 3) { + perm = std::vector{0, 2, 1}; + } else if (rank == 4) { + perm = std::vector{0, 1, 3, 2}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op matmul with input rank == %d", rank)); + } + return perm; + }; if (transpose_x) { + auto perm = gen_perm(x_rank); x_node = CreateBaseOp(graph, node, "popart_transpose", @@ -556,6 +409,7 @@ Node *matmul_v2_handler(Graph *graph, Node *node) { x_node = x_node->outputs[0]; } if (transpose_y) { + auto perm = gen_perm(y_rank); y_node = CreateBaseOp(graph, node, "popart_transpose", @@ -611,9 +465,6 @@ REGISTER_HANDLER(matmul, matmul_handler); REGISTER_HANDLER(sum, sum_handler); REGISTER_HANDLER(softmax, softmax_handler); REGISTER_HANDLER(scale, scale_handler); -REGISTER_HANDLER(softmax_with_cross_entropy, - softmax_with_cross_entropy_handler); -REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler); REGISTER_HANDLER(cumsum, cumsum_handler); REGISTER_HANDLER(matmul_v2, matmul_v2_handler); REGISTER_HANDLER(bmm, bmm_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc index 173ea6d4d514e..6badf37d5b334 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc @@ -55,9 +55,20 @@ Node *MakeOpNode(Graph *graph, op_desc->SetType(type); auto op = graph->CreateOpNode(op_desc.get()); + // inputs + std::vector input_names; for (auto *in : inputs) { - ConnectNodes(in, op); + if (in != nullptr) { + ConnectNodes(in, op); + input_names.push_back(in->Name()); + } else { + input_names.push_back(std::string("")); + } } + op->Op()->SetInput("__inputs__", input_names); + + // outputs + std::vector output_names; if (outputs.empty()) { auto var = MakeVarNode(graph, node); ConnectNodes(op, var); @@ -66,14 +77,6 @@ Node *MakeOpNode(Graph *graph, ConnectNodes(op, out); } } - - // i/o - std::vector input_names; - for (auto node : op->inputs) { - input_names.push_back(node->Name()); - } - op->Op()->SetInput("__inputs__", input_names); - std::vector output_names; for (auto node : op->outputs) { output_names.push_back(node->Name()); } @@ -138,6 +141,19 @@ Node *CreateCast(Graph *graph, graph, node, "popart_cast", inputs, outputs, {{"to", to}}); } +Node *CreateIdentityLossOp(Graph *graph, + Node *node, + const std::vector &inputs, + const std::vector &outputs, + int reduction) { + return CreateBaseOp(graph, + node, + "popart_identity_loss", + inputs, + outputs, + {{"reduction", reduction}}); +} + Node *CreateGemm(Graph *graph, Node *node, const std::vector &inputs, diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h index 582b506974f95..3071c2a0b90cf 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h @@ -67,6 +67,12 @@ Node *CreateCast(Graph *graph, const std::vector &outputs, const VarType::Type otype); +Node *CreateIdentityLossOp(Graph *graph, + Node *node, + const std::vector &inputs, + const std::vector &outputs, + int reduction); + Node *CreateGemm(Graph *graph, Node *node, const std::vector &inputs, diff --git 
a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc index 1e9291cf57256..0b95f641695c1 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc @@ -85,17 +85,6 @@ Node *identity_handler(Graph *graph, Node *node) { graph, node, "popart_identity", node->inputs, node->outputs); } -Node *identity_loss_handler(Graph *graph, Node *node) { - auto *op = node->Op(); - auto reduction = BOOST_GET_CONST(int, op->GetAttr("reduction")); - return CreateBaseOp(graph, - node, - "popart_identity_loss", - node->inputs, - node->outputs, - {{"reduction", reduction}}); -} - Node *detach_handler(Graph *graph, Node *node) { return CreateBaseOp( graph, node, "popart_detach_v2", node->inputs, node->outputs); @@ -112,5 +101,4 @@ REGISTER_HANDLER(popart_optimizer, popart_optimizer_handler); REGISTER_HANDLER(checkpointoutput, checkpointoutput_handler); REGISTER_HANDLER(custom_nll_loss, custom_nll_loss_handler); REGISTER_HANDLER(identity, identity_handler); -REGISTER_HANDLER(identity_loss, identity_loss_handler); REGISTER_HANDLER(detach, detach_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/reduce_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/reduce_ops.cc index 852cb180aa787..e1cc2de8bc547 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/reduce_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/reduce_ops.cc @@ -36,6 +36,27 @@ Node *reduce_op_handler(Graph *graph, Node *node, const std::string &op_name) { return CreateBaseOp(graph, node, op_name, node->inputs, node->outputs, attrs); } +Node *reduce_all_op_handler(Graph *graph, + Node *node, + const std::string &op_name) { + auto *op = node->Op(); + auto attrs = AttributeMap{}; + auto reduce_all = BOOST_GET_CONST(bool, op->GetAttr("reduce_all")); + if (!reduce_all) { + auto axes_ = BOOST_GET_CONST(std::vector, op->GetAttr("dim")); + auto axes = std::vector{axes_.begin(), axes_.end()}; + attrs.emplace("axes", axes); + } + auto keepdims_ = BOOST_GET_CONST(bool, op->GetAttr("keep_dim")); + auto keepdims = int64_t{keepdims_}; + attrs.emplace("keepdims", keepdims); + auto int32_x = + CreateCast(graph, node, node->inputs, {}, VarType::INT32)->outputs[0]; + auto reduce_op = CreateBaseOp(graph, node, op_name, {int32_x}, {}, attrs); + return CreateCast( + graph, node, reduce_op->outputs, node->outputs, VarType::BOOL); +} + Node *reduce_mean_handler(Graph *graph, Node *node) { return reduce_op_handler(graph, node, "popart_reducemean"); } @@ -56,6 +77,34 @@ Node *reduce_prod_handler(Graph *graph, Node *node) { return reduce_op_handler(graph, node, "popart_reduceprod"); } +Node *logsumexp_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto attrs = AttributeMap{}; + auto reduce_all = BOOST_GET_CONST(bool, op->GetAttr("reduce_all")); + if (!reduce_all) { + auto axes_ = BOOST_GET_CONST(std::vector, op->GetAttr("axis")); + auto axes = std::vector{axes_.begin(), axes_.end()}; + attrs.emplace("axes", axes); + } + auto keepdims_ = BOOST_GET_CONST(bool, op->GetAttr("keepdim")); + auto keepdims = int64_t{keepdims_}; + attrs.emplace("keepdims", keepdims); + return CreateBaseOp(graph, + node, + "popart_reducelogsumexp", + node->inputs, + node->outputs, + attrs); +} + +Node *reduce_all_handler(Graph *graph, Node *node) { + return reduce_all_op_handler(graph, node, "popart_reducemin"); +} + +Node 
*reduce_any_handler(Graph *graph, Node *node) { + return reduce_all_op_handler(graph, node, "popart_reducemax"); +} + } // namespace } // namespace ipu } // namespace platform @@ -66,3 +115,6 @@ REGISTER_HANDLER(reduce_min, reduce_min_handler); REGISTER_HANDLER(reduce_sum, reduce_sum_handler); REGISTER_HANDLER(reduce_max, reduce_max_handler); REGISTER_HANDLER(reduce_prod, reduce_prod_handler); +REGISTER_HANDLER(logsumexp, logsumexp_handler); +REGISTER_HANDLER(reduce_all, reduce_all_handler); +REGISTER_HANDLER(reduce_any, reduce_any_handler); diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h index 763c5a46abe28..14dcf65afeefd 100644 --- a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h +++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h @@ -33,6 +33,7 @@ OP_DECL(popart_dynamicadd_v2, aiGraphcoreOpset.dynamicadd, ARG(INT_VEC,axes) ARG OP_DECL(popart_sequenceslice_v2, aiGraphcoreOpset.sequenceslice, ARG(INT,zeroUnused) ) // NOLINT OP_DECL(popart_replicatedallreduce_v2, aiGraphcoreOpset.replicatedallreduce, OPT_ARG(INT_VEC,commGroup) ) // NOLINT OP_DECL(popart_ctcbeamsearchdecoder_v2, aiGraphcoreOpset.ctcbeamsearchdecoder, ARG(INT,blank) ARG(INT,beamWidth) ARG(INT,topPaths) ) // NOLINT +OP_DECL(popart_ctcloss, aiGraphcoreOpset.ctcloss, SIG_ARG(INT32,popart::ReductionType,reduction) ARG(INT32,blank) ARG(STRING,outDataType) ) // NOLINT OP_DECL(popart_shapeddropout_v2, aiGraphcoreOpset.shapeddropout, ARG(INT_VEC,shape) ARG(FLOAT,ratio) ) // NOLINT OP_DECL(popart_atan2_v2, aiGraphcoreOpset.atan2, NONE) // NOLINT OP_DECL(popart_expm1_v2, aiGraphcoreOpset.expm1, NONE) // NOLINT From 5988553f3cd4a864ddb3b9ca3d92ff26eb7923f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Mon, 11 Jul 2022 20:33:23 +0800 Subject: [PATCH 129/250] [NPU] add npu support for new executor. 
test=develop (#43403) --- .../framework/new_executor/data_transfer.cc | 7 ++ .../framework/new_executor/interpretercore.cc | 23 +++- .../new_executor/interpretercore_util.cc | 61 ++++++--- .../new_executor/new_executor_defs.h | 3 +- .../framework/new_executor/stream_analyzer.cc | 73 +++++++---- .../framework/new_executor/stream_analyzer.h | 6 +- .../memory/allocation/allocator_facade.cc | 6 +- paddle/fluid/operators/crop_op_npu.cc | 6 +- paddle/fluid/operators/memcpy_h2d_op.cc | 15 ++- paddle/fluid/platform/CMakeLists.txt | 10 ++ paddle/fluid/platform/device/npu/npu_info.cc | 4 + paddle/fluid/platform/device/npu/npu_info.h | 3 + paddle/fluid/platform/device_context.cc | 2 + paddle/fluid/platform/device_event.h | 7 ++ paddle/fluid/platform/device_event_base.h | 2 +- paddle/fluid/platform/device_event_npu.cc | 117 ++++++++++++++++++ python/paddle/fluid/executor.py | 5 +- 17 files changed, 281 insertions(+), 69 deletions(-) create mode 100644 paddle/fluid/platform/device_event_npu.cc diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index 701f0a430aa5c..b856bbec4b0c4 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -137,6 +137,13 @@ void DataTranferHelper::RunAndConstructOpFuncNode( new_op_func_node.output_index["Out"] = {var_scope_->VarId(new_var_name)}; new_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); new_op_func_node.kernel_func_(exec_ctx); + // NOTE(winter-wang): in npu device, D2H kernel is asynchronous. need to + // explicit synchronization. +#ifdef PADDLE_WITH_ASCEND_CL + if (op_type == kMemcpyD2H) { + dev_ctx->Wait(); + } +#endif // NOTE(Aurelius84): data_transform_op is expensive operation, so we tag them // as kQueueSync and execute them in thread pool. new_op_func_node.type_ = OpFuncType::kQueueSync; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 3c66eb0c4613c..c321069537c89 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -90,6 +90,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place, auto local_scope = &var_scope_.GetMutableScope()->NewScope(); local_scope_ = local_scope; } + var_scope_.SetLocalScope(local_scope_); // prune @@ -115,7 +116,6 @@ InterpreterCore::~InterpreterCore() { interpreter::CostInfo InterpreterCore::DryRun( const std::vector& feed_names, const std::vector& feed_tensors) { - var_scope_.SetLocalScope(local_scope_); Prepare(feed_names, feed_tensors, true); interpreter::CostInfo cost_info; { @@ -144,7 +144,6 @@ paddle::framework::FetchList InterpreterCore::Run( platform::AttachPointerHashToMKLDNNKey(this, place_); #endif bool is_build = is_build_; - var_scope_.SetLocalScope(local_scope_); Prepare(feed_names, feed_tensors, is_build); if (is_build) { @@ -153,8 +152,10 @@ paddle::framework::FetchList InterpreterCore::Run( // until the second step run. 
async_work_queue_ = GetWorkQueue(); ExecuteInstructionList(vec_instruction_); +#ifdef PADDLE_WITH_ASCEND_CL + platform::DeviceContextPool::Instance().Get(place_)->Wait(); +#endif } - if (create_local_scope_) { ClearLoDTensorArrayInLocalScope(); } @@ -174,7 +175,6 @@ paddle::framework::FetchList InterpreterCore::Run( platform::AttachPointerHashToMKLDNNKey(this, place_); #endif if (!is_build_) { - var_scope_.SetLocalScope(local_scope_); paddle::framework::interpreter::build_variable_scope(block_, &var_scope_); std::vector op_func_nodes; @@ -196,12 +196,14 @@ paddle::framework::FetchList InterpreterCore::Run( async_work_queue_ = GetWorkQueue(); ExecuteInstructionList(vec_instruction_); +#ifdef PADDLE_WITH_ASCEND_CL + platform::DeviceContextPool::Instance().Get(place_)->Wait(); +#endif } if (create_local_scope_) { ClearLoDTensorArrayInLocalScope(); } - // return Fetch Tensors auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName); if (fetch_var) { @@ -528,6 +530,17 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_); Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope() : var_scope_.GetMutableScope(); + +#ifdef PADDLE_WITH_ASCEND_CL + // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable + // values, but only through special `float_status` to checks whether + // the operation is overflow. More about `float_status`, see: + // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue + if (FLAGS_check_nan_inf) { + framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place); + } +#endif + auto op_with_kernel = dynamic_cast(op); { // If it is OperatorBase, InferShape do nothing. diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 1a539c1ce1cea..acbcf1da4c5e3 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -15,6 +15,7 @@ #include +#include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/new_executor/data_transfer.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" @@ -43,6 +44,7 @@ PADDLE_DEFINE_EXPORTED_bool( "Enable serial execution for standalone executor, used for debug."); DECLARE_bool(use_mkldnn); +DECLARE_bool(check_nan_inf); namespace paddle { namespace framework { @@ -446,11 +448,19 @@ void build_op_func_list(const platform::Place& place, op_func_node.output_index = outs_name2id; VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope); +#ifdef PADDLE_WITH_ASCEND_CL + // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable + // values, but only through special `float_status` to checks whether + // the operation is overflow. 
More about `float_status`, see: + // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue + if (FLAGS_check_nan_inf) { + framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place); + } +#endif + if (dynamic_cast(op) == nullptr) { // op is not a operatorwithkernel, so direcly run OperatorBase::Run() deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope); - VLOG(4) << "End run " << place << " " - << op_func_node.operator_base_->DebugStringEx(local_scope); } else { auto op_with_kernel = const_cast( static_cast(op)); @@ -593,6 +603,12 @@ void build_op_func_list(const platform::Place& place, << var_scope->GetNameById(p.second); } } + + // for debug nan/inf + if (FLAGS_check_nan_inf) { + VLOG(4) << "Check nan/inf"; + framework::details::CheckOpHasNanOrInf(*op, *runtime_scope, place); + } } VLOG(4) << "End run " << place << " " @@ -768,12 +784,7 @@ void ShrinkDownstreamMap(std::map>* downstream_map, // b: c // happens_before[i][j] means i should be executed before j - op_happens_before->resize(op_num); - for (size_t i = 0; i < op_num; ++i) { - (*op_happens_before)[i].resize(op_num); - std::fill( - (*op_happens_before)[i].begin(), (*op_happens_before)[i].end(), false); - } + op_happens_before->assign(op_num, std::vector(op_num, false)); // bfs to get all next ops auto bfs = [&](size_t op_idx) { @@ -883,6 +894,18 @@ std::map> build_op_downstream_map( } } } + // the original output of inplace op is also change. + if (!vec_instruction[op_idx].InplaceBackMap().empty()) { + auto& m = vec_instruction[op_idx].InplaceBackMap(); + for (auto& p : m) { + auto& var = p.second; + if (var2min_rw_op.count(var)) { + for (auto dep_op : var2min_rw_op[var]) { + op2dependences[op_idx].insert(dep_op); + } + } + } + } // step2: update 2 var2xxxx data structure for (auto& item : @@ -894,16 +917,6 @@ std::map> build_op_downstream_map( } } - for (auto& item : - vec_instruction[op_idx].Inputs()) { // for all inputs(read only) - for (auto var : item.second) { - if (remove_duplicate.count(var) == - 0) { // var in input list and in output list, so remove it. - update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var); - } - } - } - // NOTE(zhiqiu): The inplace op with `transfer` also changes // original output after that so add original output as well // original: a->op->a @@ -914,8 +927,16 @@ std::map> build_op_downstream_map( for (auto& p : m) { auto var = p.second; var2recent_write_op[var] = op_idx; - // var in input list and in output list, so remove it. - if (remove_duplicate.count(var) == 0) { + var2min_rw_op[var] = {static_cast(op_idx)}; + remove_duplicate.insert(var); + } + } + + for (auto& item : + vec_instruction[op_idx].Inputs()) { // for all inputs(read only) + for (auto var : item.second) { + if (remove_duplicate.count(var) == + 0) { // var in input list and in output list, so remove it. 
update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var); } } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 70a92f0ae28ae..af3951f4538f1 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -389,7 +389,8 @@ static bool IsCpuOp(const Instruction& instr) { // is supported heterogeneous place static bool IsSupportedHetePlace(const phi::Place& place) { - return platform::is_gpu_place(place) || platform::is_xpu_place(place); + return platform::is_gpu_place(place) || platform::is_npu_place(place) || + platform::is_xpu_place(place); } } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index b7a7e4c0b546f..086dac8dac1fb 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -21,23 +21,37 @@ namespace paddle { namespace framework { +namespace { +std::map>>* + d2h_ctxs = nullptr; +std::map>>* + h2d_ctxs = nullptr; +std::mutex ctx_mtx; +} // namespace StreamAnalyzer::StreamAnalyzer(const platform::Place& place) : place_(place) { - if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::EmplaceDeviceContexts( - &d2h_ctxs_, - {place}, - /*disable_setting_default_stream_for_allocator=*/true); - platform::EmplaceDeviceContexts( - &h2d_ctxs_, - {place}, - /*disable_setting_default_stream_for_allocator=*/true); -#else - PADDLE_THROW( - platform::errors::Unimplemented("CUDAPlace is not supported. Please " - "re-compile with WITH_GPU option.")); -#endif + if (platform::is_gpu_place(place) || platform::is_npu_place(place)) { + std::lock_guard lk(ctx_mtx); + if (d2h_ctxs == nullptr) { + d2h_ctxs = new std::map< + Place, + std::shared_future>>(); + h2d_ctxs = new std::map< + Place, + std::shared_future>>(); + } + if (d2h_ctxs->find(place) == d2h_ctxs->end()) { + platform::EmplaceDeviceContexts( + d2h_ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true); + platform::EmplaceDeviceContexts( + h2d_ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true); + } + d2h_ctx_ = (*d2h_ctxs)[place]; + h2d_ctx_ = (*h2d_ctxs)[place]; } } @@ -162,15 +176,15 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( const OpFuncNode& op_func_node) { auto& op_type = op_func_node.operator_base_->Type(); auto* dev_ctx = op_func_node.dev_ctx_; - // only gpu need update. xpu not need, because xpu memcpy op kernel is + // only gpu/npu need update. xpu not need, because xpu memcpy op kernel is // synchronous. 
- if (platform::is_gpu_place(place_)) { + if (platform::is_gpu_place(place_) || platform::is_npu_place(place_)) { if (op_type == interpreter::kMemcpyD2H) { VLOG(3) << "Get dev_ctx from d2h_context_pool_"; - dev_ctx = d2h_ctxs_[place_].get().get(); + dev_ctx = d2h_ctx_.get().get(); } else if (op_type == interpreter::kMemcpyH2D) { VLOG(3) << "Get dev_ctx from h2d_context_pool_"; - dev_ctx = h2d_ctxs_[place_].get().get(); + dev_ctx = h2d_ctx_.get().get(); } } return dev_ctx; @@ -188,11 +202,20 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( */ bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr, const Instruction& next_instr) { - return platform::is_xpu_place(place_) || - (&cur_instr.DeviceContext() == &next_instr.DeviceContext() || - interpreter::IsCpuOp(cur_instr) || - interpreter::IsMemcpyD2H(cur_instr) || - interpreter::IsMemcpyH2D(next_instr)); + if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true; + + // xpu memcpy kerenl is synchronous. + if (platform::is_xpu_place(place_)) return true; + + // npu d2h kernel is asynchronous. + if (platform::is_npu_place(place_)) { + return interpreter::IsCpuOp(cur_instr) || + interpreter::IsMemcpyH2D(next_instr); + } + // gpu or cpu + return interpreter::IsCpuOp(cur_instr) || + interpreter::IsMemcpyD2H(cur_instr) || + interpreter::IsMemcpyH2D(next_instr); } platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) { @@ -201,6 +224,8 @@ platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) { } else { if (platform::is_xpu_place(place_)) { return platform::kXPU; + } else if (platform::is_npu_place(place_)) { + return platform::kNPU; } return platform::kCUDA; } diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h index 61e37bbb686fc..4be8ffe6bb4ca 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/stream_analyzer.h @@ -53,9 +53,9 @@ class StreamAnalyzer { platform::DeviceType GetWaiterType(const Instruction& instr); - Place place_; - std::map>> d2h_ctxs_; - std::map>> h2d_ctxs_; + const Place place_; + std::shared_future> d2h_ctx_; + std::shared_future> h2d_ctx_; std::map> var_id2event_; }; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4364934a4027d..917cebc11f9a9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -1080,11 +1080,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } else { return m->GetAllocator(p, size)->Allocate(size); } -#elif defined PADDLE_WITH_XPU +#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) return GetAllocator(place)->Allocate(size); #else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with GPU or XPU.")); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Not compiled with GPU or XPU or NPU.")); #endif } diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc index 6c4c6eb25d820..bd50dea15f80e 100644 --- a/paddle/fluid/operators/crop_op_npu.cc +++ b/paddle/fluid/operators/crop_op_npu.cc @@ -70,8 +70,12 @@ class CropNPUKernel : public framework::OpKernel { shape->dims().size(), x->dims().size())); + // shape memory maybe have gc. 
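+    // (Presumably: the Shape input may be garbage-collected before the
+    // asynchronous NPU kernel consumes it, so a temporary tensor holding its
+    // own reference to the buffer is passed to the runner instead.)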
+ Tensor tmp_shape(*shape); + tmp_shape.mutable_data(ctx.GetPlace()); + const auto& runner = - NpuOpRunner("Crop", {*x, *shape}, {*out}, attr_input); + NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index 411841c4502fa..98ed68cf84f87 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -94,14 +94,13 @@ class MemcpyH2DOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(LoDTensor) The type of output " "is the same as input X."); - AddAttr( - "dst_place_type", - "Determine the dst place of tensor copy. " - "By Now it ONLY support CUDAPinnedPlace/CPU <-> NPUPlace/CUDAPlace " - "Other place type is Unimplemented and will cause ERROR." - "0: dst is on CUDAPlace. " - "1: dst is on NPUPlace. " - "2: dst is on XPUPlace. "); + AddAttr("dst_place_type", + "Determine the dst place of tensor copy. " + "By Now it support:" + "0. CUDAPinnedPlace/CPU <->CUDAPlace" + "1. NPUPinnedPlace/CPU <-> NPUPlace" + "2. CPU <->XPUPlace" + "Other place type is Unimplemented and will cause ERROR."); AddComment(R"DOC( MemcpyD2H Operator. By now, it ONLY supports the memcopy between CUDAPinnedPlace/CPU <-> NPUPlace/CUDAPlace. diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index efe0479871215..b00e4056259d9 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -280,6 +280,16 @@ if(WITH_XPU) CACHE INTERNAL "device event libs") endif() +if(WITH_ASCEND_CL) + cc_library( + device_event_npu + SRCS device_event_npu.cc + DEPS device_event_base npu_resource_pool) + set(DEVICE_EVENT_LIBS + device_event_npu + CACHE INTERNAL "device event libs") +endif() + if(WITH_GPU) nv_library( device_event_gpu diff --git a/paddle/fluid/platform/device/npu/npu_info.cc b/paddle/fluid/platform/device/npu/npu_info.cc index 362c4e8fae8b1..9acdef985ade2 100644 --- a/paddle/fluid/platform/device/npu/npu_info.cc +++ b/paddle/fluid/platform/device/npu/npu_info.cc @@ -285,6 +285,10 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status) { PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, status)); } +void NPUEventSynchronize(aclrtEvent event) { + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeEvent(event)); +} + void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event) { PADDLE_ENFORCE_NPU_SUCCESS(aclrtStreamWaitEvent(stream, event)); } diff --git a/paddle/fluid/platform/device/npu/npu_info.h b/paddle/fluid/platform/device/npu/npu_info.h index f7af1c246ef6c..ea55831db2e22 100644 --- a/paddle/fluid/platform/device/npu/npu_info.h +++ b/paddle/fluid/platform/device/npu/npu_info.h @@ -138,6 +138,9 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status); //! Record NPU event in the stream. void NPUEventRecord(aclrtEvent event, aclrtStream stream); +//! Synchronize NPU event. +void NPUEventSynchronize(aclrtEvent event); + //! Makes a stream wait on an event. 
void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a668d7f4b8366..6bceb696c0f8e 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -125,6 +125,8 @@ DeviceType Place2DeviceType(const platform::Place& place) { return platform::DeviceType::XPU; } else if (platform::is_ipu_place(place)) { return platform::DeviceType::IPU; + } else if (platform::is_npu_place(place)) { + return platform::DeviceType::NPU; } else if (platform::is_mlu_place(place)) { return platform::DeviceType::MLU; } else { diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index 1fd116600624c..2edccfa90c939 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -25,6 +25,7 @@ using ::paddle::platform::kCPU; using ::paddle::platform::kCUDA; +using ::paddle::platform::kNPU; using ::paddle::platform::kXPU; USE_EVENT(kCPU) @@ -41,3 +42,9 @@ USE_EVENT(kXPU); USE_EVENT_WAIT(kXPU, kXPU) USE_EVENT_WAIT(kCPU, kXPU) #endif + +#ifdef PADDLE_WITH_ASCEND_CL +USE_EVENT(kNPU); +USE_EVENT_WAIT(kNPU, kNPU) +USE_EVENT_WAIT(kCPU, kNPU) +#endif diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index b42721a60d974..51df0fd4f40ad 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -66,7 +66,7 @@ class DeviceEvent { type_id_)); // TODO(Aurelius84): only support CPU/CUDA, need consider XPU/NPU later PADDLE_ENFORCE_LT(type_id_, - 3, + 4, platform::errors::Unavailable( "Currently DeviceEvent do not support %s", place)); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/platform/device_event_npu.cc b/paddle/fluid/platform/device_event_npu.cc new file mode 100644 index 0000000000000..215f308f66348 --- /dev/null +++ b/paddle/fluid/platform/device_event_npu.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_ASCEND_CL + +#include "paddle/fluid/platform/device/npu/npu_resource_pool.h" +#include "paddle/fluid/platform/device_event_base.h" +#include "paddle/fluid/platform/event.h" +namespace paddle { +namespace platform { +struct NPUDeviceEventWrapper { + explicit NPUDeviceEventWrapper(const platform::Place& place) { + PADDLE_ENFORCE_EQ( + platform::is_npu_place(place), + true, + platform::errors::PreconditionNotMet( + "Required device shall be NPUPlace, but received %d. ", place)); + + device_id_ = place.device; + PADDLE_ENFORCE_GT( + device_id_, + -1, + platform::errors::PreconditionNotMet( + "Required DeviceOption.device_id > -1, but received %d. 
", + device_id_)); + inner_event_ = NpuEventResourcePool::Instance().New(device_id_); + } + std::shared_ptr inner_event_; + int device_id_; +}; + +void DeviceEventCreateNPU(DeviceEvent* event, + const platform::Place& place, + unsigned int) { + event->InitEvent(std::make_shared(place)); +} + +void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + auto* npu_dev_ctx = dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + npu_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into NPUDeviceContext.")); + NPUEventRecord(wrapper->inner_event_.get(), npu_dev_ctx->stream()); +} + +bool DeviceEventQueryNPU(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + PADDLE_ENFORCE_NOT_NULL( + wrapper, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast event into NPUDeviceEventWrapper.")); + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + platform::NPUEventQuery(wrapper->inner_event_.get(), &status); + return ACL_EVENT_STATUS_COMPLETE == status; +} + +void DeviceEventFinishNPU(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + NPUEventSynchronize(wrapper->inner_event_.get()); +} + +void DeviceEventNPUWaitNPU(const DeviceEvent* event, + const DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + auto* npu_dev_ctx = dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + npu_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into NPUDeviceContext.")); + NPUStreamWaitEvent(npu_dev_ctx->stream(), wrapper->inner_event_.get()); +} + +void DeviceEventCPUWaitNPU(const DeviceEvent* event, + const DeviceContext* context) { + DeviceEventFinishNPU(event); +} + +void DeviceEventSetFinishedNPU(const DeviceEvent* event) { + // do nothing +} + +void EventResetNPU(const DeviceEvent* event) { + // do nothing +} + +} // namespace platform +} // namespace paddle + +using ::paddle::platform::kCPU; +using ::paddle::platform::kNPU; +REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU) +REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU) +REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU) +REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU) +REGISTER_EVENT_SET_FINISHED_FUNCTION( + kNPU, paddle::platform::DeviceEventSetFinishedNPU) +REGISTER_EVENT_WAIT_FUNCTION(kNPU, + kNPU, + paddle::platform::DeviceEventNPUWaitNPU) +REGISTER_EVENT_WAIT_FUNCTION(kCPU, + kNPU, + paddle::platform::DeviceEventCPUWaitNPU) +REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU) +#endif diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 3303b6c9472ff..5f80e3b757770 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1400,9 +1400,8 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, program = pruned_program def _can_use_interpreter_core(program, place): - if core.is_compiled_with_npu() or core.is_compiled_with_mlu( - ) or core.is_compiled_with_ipu() or isinstance( - place, core.CustomPlace): + if core.is_compiled_with_mlu() or core.is_compiled_with_ipu( + ) or isinstance(place, core.CustomPlace): return False compiled = isinstance(program, compiler.CompiledProgram) From f1111f3ca8a1a81b3b79dce52ee98569fb0eca4a Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Mon, 11 Jul 2022 20:51:20 +0800 Subject: [PATCH 
130/250] [IPU] support more ops 1/N (#44205) * add authors Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen * squash cpp changes 2/N Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen --- .../ipu/popart_canonicalization/nn_ops.cc | 470 ++++++++++++++++ .../ipu/popart_canonicalization/tensor_ops.cc | 507 +++++++++++++++++- 2 files changed, 975 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc index 5f0ba745ed3c9..21c9beade3082 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc @@ -376,11 +376,473 @@ Node *dropout_handler(Graph *graph, Node *node) { } } +Node *conv2d_transpose_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + + auto data_format = BOOST_GET_CONST(std::string, op->GetAttr("data_format")); + if (data_format != "NCHW") { + PADDLE_THROW( + platform::errors::InvalidArgument("Only support NCHW as data_format.")); + } + + auto *kernel_info = GetInputVarNode("Filter", node); + auto kernel_shape = kernel_info->Var()->GetShape(); + + auto dilations_ = BOOST_GET_CONST(std::vector, op->GetAttr("dilations")); + auto dilations = std::vector{dilations_.begin(), dilations_.end()}; + auto strides_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); + auto strides = std::vector{strides_.begin(), strides_.end()}; + auto output_padding_ = + BOOST_GET_CONST(std::vector, op->GetAttr("output_padding")); + auto output_padding = + std::vector{output_padding_.begin(), output_padding_.end()}; + auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups")); + auto group = int64_t(group_); + + auto padding_algorithm = + BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm")); + + auto paddings_ = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + if (paddings_.size() == 2) { + paddings_.push_back(paddings_[0]); + paddings_.push_back(paddings_[1]); + } else if (paddings_.size() == 4) { + std::swap(paddings_[1], paddings_[2]); + } + auto paddings = std::vector{paddings_.begin(), paddings_.end()}; + + if (padding_algorithm == "SAME") { + // Update paddings and dilations based on the sizes of H and W. 
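+    // SAME padding per spatial dim: out = ceil(in / stride),
+    // pad_sum = max((out - 1) * stride + kernel - in, 0), split as
+    // pad_before = pad_sum / 2 and pad_after = pad_sum - pad_before,
+    // with dilations reset to 1. E.g. (illustrative numbers) in = 7,
+    // stride = 2, kernel = 3 gives out = 4, pad_sum = 2 and pads (1, 1).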
+ auto input_shape = GetInputVarNode("Input", node)->Var()->GetShape(); + for (auto i = 0; i < 2; i++) { + auto out_size = (input_shape[i + 2] + strides[i] - 1) / strides[i]; + auto pad_sum = std::max( + (out_size - 1) * strides[i] + kernel_shape[i] - input_shape[i + 2], + static_cast(0)); + auto pad_0 = pad_sum / 2; + auto pad_1 = pad_sum - pad_0; + paddings[i] = pad_0; + paddings[i + 2] = pad_1; + } + for (auto i = 0; i < dilations.size(); i++) { + dilations[i] = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto i = 0; i < paddings.size(); i++) { + paddings[i] = 0; + } + } + + auto attrs = AttributeMap{{"dilations", dilations}, + {"group", group}, + {"kernel_shape", kernel_shape}, + {"output_padding", output_padding}, + {"pads", paddings}, + {"strides", strides}}; + if (!op->Input("Bias").empty()) { + return CreateBaseOp(graph, + node, + "popart_convtranspose", + { + GetInputVarNode("Input", node), + GetInputVarNode("Filter", node), + GetInputVarNode("Bias", node), + }, + node->outputs, + attrs); + } else { + return CreateBaseOp(graph, + node, + "popart_convtranspose", + { + GetInputVarNode("Input", node), + GetInputVarNode("Filter", node), + }, + node->outputs, + attrs); + } +} + +Node *affine_channel_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + + auto data_layout = BOOST_GET_CONST(std::string, op->GetAttr("data_layout")); + if (data_layout != "NCHW") { + PADDLE_THROW( + platform::errors::InvalidArgument("Only support NCHW as data_format.")); + } + + auto *scale = GetInputVarNode("Scale", node); + auto *bias = GetInputVarNode("Bias", node); + auto scale_shape = scale->Var()->GetShape(); + auto bias_shape = bias->Var()->GetShape(); + if (scale_shape.size() <= 1 || bias_shape.size() <= 1) { + auto attrs = AttributeMap{{"value", std::vector{1, -1, 1, 1}}, + {"dims", std::vector{4}}, + {"dtype", ONNXDataType::INT64}}; + auto new_shape_const = CreateConst(graph, node, {}, {}, attrs); + + scale = CreateBaseOp(graph, + node, + "popart_reshape", + {scale, new_shape_const->outputs[0]}, + {}, + {}) + ->outputs[0]; + bias = CreateBaseOp(graph, + node, + "popart_reshape", + {bias, new_shape_const->outputs[0]}, + {}, + {}) + ->outputs[0]; + } + auto *out = CreateBaseOp( + graph, node, "popart_mul", {GetInputVarNode("X", node), scale}, {}); + return CreateBaseOp(graph, + node, + "popart_add", + {out->outputs[0], bias}, + {GetOutputVarNode("Out", node)}); +} + +Node *interp_handler(Graph *graph, Node *node, const std::string &mode) { + auto *op = node->Op(); + + auto data_layout = BOOST_GET_CONST(std::string, op->GetAttr("data_layout")); + if (data_layout != "NCHW") { + PADDLE_THROW( + platform::errors::InvalidArgument("Only support NCHW as data_format.")); + } + + auto align_corners = BOOST_GET_CONST(bool, op->GetAttr("align_corners")); + auto align_mode = BOOST_GET_CONST(int, op->GetAttr("align_mode")); + + auto paddle_target_dtype = VarType::FP32; + auto onnx_target_dtype = ONNXDataType::FLOAT; + if (GetInputVarNode("X", node)->Var()->GetDataType() == VarType::FP16) { + paddle_target_dtype = VarType::FP16; + onnx_target_dtype = ONNXDataType::FLOAT16; + } + + std::string coordinate_transformation_mode = "half_pixel"; + if (align_corners) { + coordinate_transformation_mode = "align_corners"; + } else if (mode == "nearest") { + coordinate_transformation_mode = "asymmetric"; + } else if (align_mode == 1 && mode == "cubic") { + coordinate_transformation_mode = "asymmetric"; + } + + bool has_out_size = node->Op()->Input("OutSize").size() > 0; + bool has_size_tensor = 
node->Op()->Input("SizeTensor").size() > 0; + bool has_scale_tensor = node->Op()->Input("Scale").size() > 0; + + Node *size = nullptr; + Node *scale = nullptr; + // Input: Size and Scale + if (has_out_size) { + // Get 'size' from the tensor + size = GetInputVarNode("OutSize", node); + if (size->Var()->GetDataType() != VarType::INT64) { + size = CreateCast(graph, + node, + {GetInputVarNode("OutSize", node)}, + {}, + VarType::INT64) + ->outputs[0]; + } + } else if (has_size_tensor) { + // Get 'size' from multi-tensors + std::vector size_nodes; + for (auto var_name : node->Op()->Input("SizeTensor")) { + Node *size_node = GetInputVarNodeByVarName(var_name, node); + if (size_node->Var()->GetDataType() != VarType::INT64) { + size_node = CreateCast(graph, node, {size_node}, {}, VarType::INT64) + ->outputs[0]; + } + size_nodes.push_back(size_node); + } + size = CreateBaseOp(graph, + node, + "popart_concat", + size_nodes, + {}, + {{"axis", int64_t(0)}}) + ->outputs[0]; + } else if (has_scale_tensor) { + // Get 'scale' from tensor + scale = GetInputVarNode("Scale", node); + if (scale->Var()->GetDataType() != paddle_target_dtype) { + scale = + CreateCast(graph, node, {scale}, {}, paddle_target_dtype)->outputs[0]; + } + auto *padding = CreateConst(graph, + node, + {}, + {}, + {{"value", std::vector{1.0, 1.0}}, + {"dims", std::vector{2}}, + {"dtype", onnx_target_dtype}}) + ->outputs[0]; + scale = CreateBaseOp(graph, + node, + "popart_concat", + {padding, scale}, + {}, + {{"axis", int64_t(0)}}) + ->outputs[0]; + } else { + // Get 'size' or 'scale' from attribute + auto out_d = BOOST_GET_CONST(int, op->GetAttr("out_d")); + auto out_h = BOOST_GET_CONST(int, op->GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, op->GetAttr("out_w")); + if (out_d > 0 || out_w > 0 || out_h > 0) { + std::vector out_size; + if (GetInputVarNode("X", node)->Var()->GetShape().size() == 5) { + out_size.push_back(int64_t(out_d)); + out_size.push_back(int64_t(out_h)); + } else if (GetInputVarNode("X", node)->Var()->GetShape().size() == 4) { + out_size.push_back(int64_t(out_h)); + } + out_size.push_back(int64_t(out_w)); + size = + CreateConst(graph, + node, + {}, + {}, + {{"value", out_size}, + {"dims", std::vector{int64_t(out_size.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs[0]; + } else { + auto scale_value = + BOOST_GET_CONST(std::vector, op->GetAttr("scale")); + float padding = 1.0; + scale_value.insert(scale_value.begin(), padding); + scale_value.insert(scale_value.begin(), padding); + scale = CreateConst( + graph, + node, + {}, + {}, + {{"value", scale_value}, + {"dims", std::vector{int64_t(scale_value.size())}}, + {"dtype", onnx_target_dtype}}) + ->outputs[0]; + } + } + + Node *roi = + CreateConst( + graph, + node, + {}, + {}, + {{"value", + std::vector( + GetInputVarNode("X", node)->Var()->GetShape().size() * 2, 1.0)}, + {"dims", + std::vector{int64_t( + GetInputVarNode("X", node)->Var()->GetShape().size() * 2)}}, + {"dtype", onnx_target_dtype}}) + ->outputs[0]; + + if (size != nullptr) { + Node *input_shape = + CreateBaseOp( + graph, node, "popart_shape", {GetInputVarNode("X", node)}, {}) + ->outputs[0]; + Node *start = CreateConst(graph, + node, + std::vector{0}, + std::vector{1}, + ONNXDataType::INT32) + ->outputs[0]; + Node *end = CreateConst(graph, + node, + std::vector{2}, + std::vector{1}, + ONNXDataType::INT32) + ->outputs[0]; + Node *axes = CreateConst(graph, + node, + std::vector{0}, + std::vector{1}, + ONNXDataType::INT32) + ->outputs[0]; + Node *nc = CreateBaseOp(graph, + node, + "popart_slice", 
+ {input_shape, start, end, axes}, + {}, + {}) + ->outputs[0]; + size = CreateBaseOp(graph, + node, + "popart_concat", + {nc, size}, + {}, + {{"axis", int64_t(0)}}) + ->outputs[0]; + } + auto resize_attrs = AttributeMap{ + {"coordinate_transformation_mode", coordinate_transformation_mode}, + {"cubic_coeff_a", float{-0.75}}, + {"exclude_outside", int64_t{0}}, + {"extrapolation_value", float{0.0}}, + {"mode", mode}, + {"nearest_mode", std::string("round_prefer_floor")}}; + + if (mode == "nearest" && coordinate_transformation_mode == "asymmetric") { + resize_attrs.at("nearest_mode") = std::string("floor"); + } + + return CreateBaseOp(graph, + node, + "popart_resize", + {GetInputVarNode("X", node), roi, scale, size}, + {GetOutputVarNode("Out", node)}, + resize_attrs); +} + +Node *bilinear_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "linear"); +} + +Node *nearest_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "nearest"); +} + +Node *bicubic_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "cubic"); +} + +Node *linear_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "linear"); +} + +Node *trilinear_interp_v2_handler(Graph *graph, Node *node) { + return interp_handler(graph, node, "linear"); +} + +Node *data_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + + int slot_dim = -1; + if (op->HasAttr("slot_dim")) { + slot_dim = BOOST_GET_CONST(int, op->GetAttr("slot_dim")); + } + + if (slot_dim > 0) { + PADDLE_THROW( + platform::errors::InvalidArgument("slot_dim > 0 is not supported.")); + } + + bool enable_scale_and_shift = false; + if (op->HasAttr("enable_scale_and_shift")) { + enable_scale_and_shift = + BOOST_GET_CONST(bool, op->GetAttr("enable_scale_and_shift")); + } + + auto *mean_arr = CreateBaseOp(graph, + node, + "popart_div", + {GetInputVarNode("BatchSum", node), + GetInputVarNode("BatchSize", node)}, + {}) + ->outputs[0]; + auto *scale_arr = CreateBaseOp(graph, + node, + "popart_div", + {GetInputVarNode("BatchSize", node), + GetInputVarNode("BatchSquareSum", node)}, + {}) + ->outputs[0]; + scale_arr = + CreateBaseOp(graph, node, "popart_sqrt", {scale_arr}, {})->outputs[0]; + auto out = + CreateBaseOp( + graph, node, "popart_sub", {GetInputVarNode("X", node), mean_arr}, {}) + ->outputs[0]; + + if (enable_scale_and_shift) { + auto scale_res = CreateBaseOp(graph, + node, + "popart_mul", + {out, GetInputVarNode("scale_w", node)}, + {}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_add", + {scale_res, GetInputVarNode("bias", node)}, + {GetOutputVarNode("Y", node)}); + } else { + return CreateBaseOp(graph, + node, + "popart_mul", + {out, scale_arr}, + {GetOutputVarNode("Y", node)}); + } +} + +Node *pad_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto mode = BOOST_GET_CONST(std::string, op->GetAttr("mode")); + auto value = BOOST_GET_CONST(float, op->GetAttr("value")); + auto data_format = BOOST_GET_CONST(std::string, op->GetAttr("data_format")); + + if (data_format == "NDHWC") { + PADDLE_THROW( + platform::errors::Unimplemented("NDHWC format is not supported.")); + } + if (mode == "replicate" || mode == "circular") { + PADDLE_THROW(platform::errors::Unimplemented( + "circular and replicate modes are not supported.")); + } + if (op->Input("Paddings").size()) { + // Paddings -> input tensor + // PopART Pad Op only support `pad` as a constant + PADDLE_THROW(platform::errors::Unimplemented( + "Do not support 
Paddings as a inputs tensor")); + } + // Paddings -> Attr + auto paddings = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + std::vector new_paddings(10, 0); + new_paddings[2] = paddings[4]; + new_paddings[3] = paddings[2]; + new_paddings[4] = paddings[0]; + new_paddings[7] = paddings[5]; + new_paddings[8] = paddings[3]; + new_paddings[9] = paddings[1]; + + auto *paddings_node = CreateConst(graph, + node, + new_paddings, + std::vector{10}, + ONNXDataType::INT64) + ->outputs[0]; + auto *value_node = CreateConst(graph, + node, + std::vector{value}, + std::vector{1}, + ONNXDataType::FLOAT) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_pad", + {GetInputVarNode("X", node), paddings_node, value_node}, + {GetOutputVarNode("Out", node)}, + {{"mode", mode}}); +} + } // namespace } // namespace ipu } // namespace platform } // namespace paddle +REGISTER_HANDLER(affine_channel, affine_channel_handler); REGISTER_HANDLER(pool2d, pool2d_handler); REGISTER_HANDLER(max_pool2d_with_index, max_pool2d_with_index_handler); REGISTER_HANDLER(batch_norm, batch_norm_handler); @@ -388,4 +850,12 @@ REGISTER_HANDLER(group_norm, group_norm_handler); REGISTER_HANDLER(instance_norm, instance_norm_handler); REGISTER_HANDLER(layer_norm, layer_norm_handler); REGISTER_HANDLER(conv2d, conv2d_handler); +REGISTER_HANDLER(conv2d_transpose, conv2d_transpose_handler); REGISTER_HANDLER(dropout, dropout_handler); +REGISTER_HANDLER(bilinear_interp_v2, bilinear_interp_v2_handler); +REGISTER_HANDLER(nearest_interp_v2, nearest_interp_v2_handler); +REGISTER_HANDLER(bicubic_interp_v2, bicubic_interp_v2_handler); +REGISTER_HANDLER(linear_interp_v2, linear_interp_v2_handler); +REGISTER_HANDLER(trilinear_interp_v2, trilinear_interp_v2_handler); +REGISTER_HANDLER(data_norm, data_norm_handler); +REGISTER_HANDLER(pad3d, pad_handler); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc index 9b7fb7b835235..0bf0335db0f34 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/tensor_ops.cc @@ -33,10 +33,15 @@ Node *fill_constant_handler(Graph *graph, Node *node) { auto dtype = VarType2OnnxDType(static_cast(dtype_)); auto dims = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); auto value_ = BOOST_GET_CONST(float, op->GetAttr("value")); - size_t size = 1; + int size = 1; for (auto &dim : dims) { size *= dim; } + PADDLE_ENFORCE_GT(size, + 0, + errors::InvalidArgument( + "IPU doesn't support non-positive dimensions. Please " + "check tensor shape setting.")); Attribute value; switch (dtype_) { case VarType::FP16: @@ -598,10 +603,15 @@ Node *fill_any_like_handler(Graph *graph, Node *node) { auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); auto dtype_ = BOOST_GET_CONST(int, op->GetAttr("dtype")); auto dtype = static_cast(dtype_); - size_t size = 1; + int size = 1; for (auto &dim : x_shape) { size *= dim; } + PADDLE_ENFORCE_GT(size, + 0, + errors::InvalidArgument( + "IPU doesn't support non-positive dimensions. 
Please " + "check tensor shape setting.")); Attribute out_value; switch (dtype) { @@ -748,6 +758,491 @@ Node *dot_handler(Graph *graph, Node *node) { }); } +Node *clip_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + // if (min_value == -FLT_MAX) then means no min_value + // if (max_value == FLT_MAX) then means no max_value + auto min_value = BOOST_GET_CONST(float, op->GetAttr("min")); + auto max_value = BOOST_GET_CONST(float, op->GetAttr("max")); + + bool has_min_tensor = false; + bool has_max_tensor = false; + if (node->Op()->Input("Min").size()) { + has_min_tensor = true; + } + if (node->Op()->Input("Max").size()) { + has_max_tensor = true; + } + + bool transfer_input_dtype = false; + Node *input_data = GetInputVarNode("X", node); + if (input_data->Var()->GetDataType() != VarType::FP32 && + input_data->Var()->GetDataType() != VarType::FP16) { + input_data = + CreateCast(graph, node, {input_data}, {}, VarType::FP32)->outputs[0]; + transfer_input_dtype = true; + } + + Node *min_tensor = nullptr; + if (has_min_tensor) { + if (GetInputVarNode("Min", node)->Var()->GetDataType() != VarType::FP32) { + min_tensor = + CreateCast( + graph, node, {GetInputVarNode("Min", node)}, {}, VarType::FP32) + ->outputs[0]; + } else { + min_tensor = GetInputVarNode("Min", node); + } + } else { + min_tensor = CreateConst(graph, + node, + {}, + {}, + {{"value", std::vector{min_value}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}) + ->outputs[0]; + } + + Node *max_tensor = nullptr; + if (has_max_tensor) { + if (GetInputVarNode("Max", node)->Var()->GetDataType() != VarType::FP32) { + max_tensor = + CreateCast( + graph, node, {GetInputVarNode("Max", node)}, {}, VarType::FP32) + ->outputs[0]; + } else { + max_tensor = GetInputVarNode("Max", node); + } + } else { + max_tensor = CreateConst(graph, + node, + {}, + {}, + {{"value", std::vector{max_value}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::FLOAT}}) + ->outputs[0]; + } + + if (transfer_input_dtype) { + auto clip_res = CreateBaseOp( + graph, node, "popart_clip", {input_data, min_tensor, max_tensor}, {}); + return CreateCast(graph, + node, + clip_res->outputs, + {GetOutputVarNode("Out", node)}, + GetInputVarNode("X", node)->Var()->GetDataType()); + } else { + return CreateBaseOp(graph, + node, + "popart_clip", + {input_data, min_tensor, max_tensor}, + {GetOutputVarNode("Out", node)}); + } +} + +Node *dist_handler(Graph *graph, Node *node) { + // Minimum negative float + union neg_infinity { + int neg_int_inf; + float neg_float_int; + }; + neg_infinity neg_inf; + neg_inf.neg_int_inf = 0xFF800000; + float g_NegFloatInfinity = neg_inf.neg_float_int; + + auto *op = node->Op(); + auto *sub_node = + CreateBaseOp(graph, + node, + "popart_sub", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + {}) + ->outputs[0]; + auto *abs_node = + CreateBaseOp(graph, node, "popart_abs", {sub_node}, {})->outputs[0]; + + auto p = BOOST_GET_CONST(float, op->GetAttr("p")); + + // Reshape to 1-D output + auto target_shape = AttributeMap{{"value", std::vector{-1}}, + {"dims", std::vector{1}}, + {"dtype", ONNXDataType::INT64}}; + auto *target_shape_node = + CreateBaseOp(graph, node, "popart_constant", {}, {}, target_shape) + ->outputs[0]; + + if (fabs(p) < 1e-6) { + auto *sign_node = + CreateBaseOp(graph, node, "popart_sign", {abs_node}, {})->outputs[0]; + auto *sum_node = CreateBaseOp(graph, + node, + "popart_reducesum", + {sign_node}, + {}, + {{"keepdims", int64_t{0}}}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + 
"popart_reshape", + {sum_node, target_shape_node}, + {GetOutputVarNode("Out", node)}); + } else if (p == std::numeric_limits::infinity()) { + auto *max_node = CreateBaseOp(graph, + node, + "popart_reducemax", + {abs_node}, + {}, + {{"keepdims", int64_t{0}}}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_reshape", + {max_node, target_shape_node}, + {GetOutputVarNode("Out", node)}); + } else if (p == g_NegFloatInfinity) { + auto *min_node = CreateBaseOp(graph, + node, + "popart_reducemin", + {abs_node}, + {}, + {{"keepdims", int64_t{0}}}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_reshape", + {min_node, target_shape_node}, + {GetOutputVarNode("Out", node)}); + } else { + auto target_dtype = ONNXDataType::FLOAT; + if (GetInputVarNode("X", node)->Var()->GetDataType() == VarType::FP16) { + target_dtype = ONNXDataType::FLOAT16; + } + + auto pow_factor = AttributeMap{{"value", std::vector{p}}, + {"dims", std::vector{1}}, + {"dtype", target_dtype}}; + auto *pow_factor_node = + CreateBaseOp(graph, node, "popart_constant", {}, {}, pow_factor) + ->outputs[0]; + auto *pow_node = + CreateBaseOp(graph, node, "popart_pow", {abs_node, pow_factor_node}, {}) + ->outputs[0]; + auto *sum_node = CreateBaseOp(graph, + node, + "popart_reducesum", + {pow_node}, + {}, + {{"keepdims", int64_t{0}}}) + ->outputs[0]; + auto *s_node = + CreateBaseOp( + graph, node, "popart_reshape", {sum_node, target_shape_node}, {}) + ->outputs[0]; + auto *p_1 = + CreateBaseOp(graph, node, "popart_reciprocal", {pow_factor_node}, {}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_pow", + {s_node, p_1}, + {GetOutputVarNode("Out", node)}); + } +} + +Node *expand_as_v2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + Node *shape = nullptr; + auto op_inputs = op->Inputs(); + // PopART Expand Op only support the constant tensor as the input `shape`. + if (op_inputs.find("target_tensor") != op_inputs.end()) { + PADDLE_THROW(platform::errors::Unimplemented( + "Do not support input tensor `target_tensor`. Please use the attribute " + "`target_shape`.")); + } + auto input_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto shape_value = + BOOST_GET_CONST(std::vector, op->GetAttr("target_shape")); + // Check the dimensions + int input_shape_index = input_shape.size() - 1; + int target_shape_index = shape_value.size() - 1; + while (input_shape_index >= 0) { + if (input_shape[input_shape_index] != + int64_t(shape_value[target_shape_index]) && + input_shape[input_shape_index] != int64_t(1)) { + PADDLE_THROW(platform::errors::Unimplemented( + "For input and `shape`, corresponding dimensions must have the same " + "value or input dim = 1.")); + } + target_shape_index--; + input_shape_index--; + } + shape = CreateConst( + graph, + node, + {}, + {}, + {{"value", + std::vector{shape_value.begin(), shape_value.end()}}, + {"dims", std::vector{int64_t(shape_value.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_expand", + {GetInputVarNode("X", node), shape}, + {GetOutputVarNode("Out", node)}); +} + +Node *expand_v2_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + + // PopART Expand Op only support the constant tensor as the input `shape`. + if (op->Input("Shape").size()) { + PADDLE_THROW( + platform::errors::Unimplemented("Do not support input tensor `Shape`. 
" + "Please use the attribute `shape`.")); + } + if (op->Input("expand_shapes_tensor").size()) { + PADDLE_THROW(platform::errors::Unimplemented( + "Do not support input tensor `expand_shapes_tensor`. Please use the " + "attribute `shape`.")); + } + auto input_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto shape_value = BOOST_GET_CONST(std::vector, op->GetAttr("shape")); + // Check the dimensions + int input_shape_index = input_shape.size() - 1; + int target_shape_index = shape_value.size() - 1; + while (input_shape_index >= 0) { + if (input_shape[input_shape_index] != + int64_t(shape_value[target_shape_index]) && + input_shape[input_shape_index] != int64_t(1)) { + PADDLE_THROW(platform::errors::Unimplemented( + "For input and `shape`, corresponding dimensions must have the same " + "value or input dim = 1.")); + } + target_shape_index--; + input_shape_index--; + } + + auto *shape = + CreateConst( + graph, + node, + {}, + {}, + {{"value", + std::vector{shape_value.begin(), shape_value.end()}}, + {"dims", std::vector{int64_t(shape_value.size())}}, + {"dtype", ONNXDataType::INT64}}) + ->outputs[0]; + + return CreateBaseOp(graph, + node, + "popart_expand", + {GetInputVarNode("X", node), shape}, + {GetOutputVarNode("Out", node)}); +} + +Node *flatten_contiguous_range_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto start_axis = BOOST_GET_CONST(int, op->GetAttr("start_axis")); + auto stop_axis = BOOST_GET_CONST(int, op->GetAttr("stop_axis")); + auto input_rank = GetInputVarNode("X", node)->Var()->GetShape().size(); + + if (start_axis < 0) { + start_axis += input_rank; + } + if (stop_axis < 0) { + stop_axis += input_rank; + } + + std::vector target_shape; + if (start_axis == 0 && stop_axis == input_rank - 1) { + target_shape.push_back(-1); + } else { + auto input_shape = GetInputVarNode("X", node)->Var()->GetShape(); + if (start_axis == 0) { + target_shape.assign(input_shape.begin() + stop_axis + 1, + input_shape.end()); + target_shape.insert(target_shape.begin(), -1); + } else if (stop_axis == input_rank - 1) { + target_shape.assign(input_shape.begin(), + input_shape.begin() + start_axis); + target_shape.push_back(-1); + } else { + target_shape.insert(target_shape.begin(), + input_shape.begin(), + input_shape.begin() + start_axis); + target_shape.push_back(-1); + target_shape.insert(target_shape.end(), + input_shape.begin() + stop_axis + 1, + input_shape.end()); + } + } + auto *unknown_dim_node = CreateConst(graph, + node, + target_shape, + {int64_t(target_shape.size())}, + ONNXDataType::INT64) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_reshape", + {GetInputVarNode("X", node), unknown_dim_node}, + {GetOutputVarNode("Out", node)}, + {}); +} + +Node *flip_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto axes = BOOST_GET_CONST(std::vector, op->GetAttr("axis")); + auto input_shape = GetInputVarNode("X", node)->Var()->GetShape(); + for (auto it = axes.begin(); it != axes.end();) { + if (*it < 0) { + *it += input_shape.size(); + } + // Remove input_shape[axis] == 1 + if (input_shape[*it] == 1) { + it = axes.erase(it); + } else { + it++; + } + } + auto *temp_node = GetInputVarNode("X", node); + for (auto i = 0; i < axes.size(); i++) { + auto axis = axes[i]; + std::vector split; + split.resize(input_shape[axis], 1); + std::vector splits_output_nodes; + for (int j = 0; j < split.size(); j++) { + splits_output_nodes.push_back(MakeVarNode(graph, node)); + } + auto splits_outputs = CreateBaseOp(graph, + node, + "popart_split", + 
{temp_node}, + {splits_output_nodes}, + {{"num_outputs", int64_t(split.size())}, + {"axis", int64_t(axis)}, + {"split", split}}) + ->outputs; + std::reverse(splits_outputs.begin(), splits_outputs.end()); + if (i != axes.size() - 1) { + temp_node = CreateBaseOp(graph, + node, + "popart_concat", + splits_outputs, + {}, + {{"axis", int64_t(axis)}}) + ->outputs[0]; + } else { + temp_node = CreateBaseOp(graph, + node, + "popart_concat", + splits_outputs, + {}, + {{"axis", int64_t(axis)}}) + ->outputs[0]; + } + } + // In case of `axis` is empty. Identity Op will be deleted in passes. + return CreateBaseOp(graph, + node, + "popart_identity", + {temp_node}, + {GetOutputVarNode("Out", node)}, + {}); +} + +Node *meshgrid_handler(Graph *graph, Node *node) { + Node *res = nullptr; + // All inputs are 1-D tensors + std::vector out_shape; + for (auto input : node->inputs) { + auto input_shape = input->Var()->GetShape(); + out_shape.push_back(input_shape[0]); + } + // Expand Op only allows a const tensor as `shape` + auto *out_shape_node = CreateConst(graph, + node, + out_shape, + {int64_t(out_shape.size())}, + ONNXDataType::INT64) + ->outputs[0]; + + for (int i = 0; i < node->inputs.size(); i++) { + // Reshape each input tensor to [node->inputs.size()] by filling with 1 + std::vector target_shape(node->inputs.size(), 1); + target_shape[i] = node->inputs[i]->Var()->GetShape()[0]; + auto *target_shape_node = CreateConst(graph, + node, + target_shape, + {int64_t(target_shape.size())}, + ONNXDataType::INT64) + ->outputs[0]; + auto *t_reshaped = CreateBaseOp(graph, + node, + "popart_reshape", + {node->inputs[i], target_shape_node}, + {}, + {}) + ->outputs[0]; + res = CreateBaseOp(graph, + node, + "popart_expand", + {t_reshaped, out_shape_node}, + {node->outputs[i]}); + } + return res; +} + +Node *p_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto keepdim = BOOST_GET_CONST(bool, op->GetAttr("keepdim")); + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + auto porder = BOOST_GET_CONST(float, op->GetAttr("porder")); + + auto target_dtype = ONNXDataType::FLOAT; + if (GetInputVarNode("X", node)->Var()->GetDataType() == VarType::FP16) { + target_dtype = ONNXDataType::FLOAT16; + } + + auto *pnode = CreateConst(graph, + node, + std::vector{porder}, + std::vector{1}, + target_dtype) + ->outputs[0]; + auto *abs_node = + CreateBaseOp(graph, node, "popart_abs", {GetInputVarNode("X", node)}, {}) + ->outputs[0]; + auto *pow_node = + CreateBaseOp(graph, node, "popart_pow", {abs_node, pnode}, {}) + ->outputs[0]; + auto *reducesum_node = CreateBaseOp(graph, + node, + "popart_reducesum", + {pow_node}, + {}, + {{"axes", std::vector{axis}}, + {"keepdims", int64_t(keepdim)}}) + ->outputs[0]; + auto *pnode1 = + CreateConst(graph, + node, + std::vector{static_cast(1.0 / porder)}, + std::vector{1}, + target_dtype) + ->outputs[0]; + return CreateBaseOp(graph, + node, + "popart_pow", + {reducesum_node, pnode1}, + {GetOutputVarNode("Out", node)}); +} + } // namespace } // namespace ipu } // namespace platform @@ -759,6 +1254,7 @@ REGISTER_HANDLER(uniform_random, uniform_random_handler); REGISTER_HANDLER(transpose2, transpose_handler); REGISTER_HANDLER(reshape2, reshape_handler); REGISTER_HANDLER(flatten2, flatten2_handler); +REGISTER_HANDLER(flatten_contiguous_range, flatten_contiguous_range_handler); REGISTER_HANDLER(gather, gather_handler); REGISTER_HANDLER(squeeze2, squeeze_handler); REGISTER_HANDLER(cast, cast_handler); @@ -769,6 +1265,8 @@ REGISTER_HANDLER(stack, stack_handler); 
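// Editorial note, not part of the patch: the tensor handlers added in this
// patch are decompositions into existing PopART primitives rather than new
// custom ops; p_norm_handler above, for example, lowers to
//   out = pow(reducesum(pow(abs(x), porder), axes = {axis}, keepdims), 1 / porder)
// with porder and 1 / porder materialised as constant tensors.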
REGISTER_HANDLER(shape, shape_handler); REGISTER_HANDLER(slice, slice_handler); REGISTER_HANDLER(expand, expand_handler); +REGISTER_HANDLER(expand_v2, expand_v2_handler); +REGISTER_HANDLER(expand_as_v2, expand_as_v2_handler); REGISTER_HANDLER(assign, assign_handler); REGISTER_HANDLER(assign_value, assign_value_handler); REGISTER_HANDLER(fill_any_like, fill_any_like_handler); @@ -777,3 +1275,8 @@ REGISTER_HANDLER(split, split_handler); REGISTER_HANDLER(one_hot, one_hot_handler); REGISTER_HANDLER(one_hot_v2, one_hot_v2_handler); REGISTER_HANDLER(dot, dot_handler); +REGISTER_HANDLER(clip, clip_handler); +REGISTER_HANDLER(dist, dist_handler); +REGISTER_HANDLER(flip, flip_handler); +REGISTER_HANDLER(meshgrid, meshgrid_handler); +REGISTER_HANDLER(p_norm, p_norm_handler); From be746adf7e706950c585e9295ba15134a8f5dbdd Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 12 Jul 2022 10:33:02 +0800 Subject: [PATCH 131/250] [operator migration] Migrate kernel of unique consecutive op. (#44228) --- .../fluid/operators/unique_consecutive_op.cc | 8 +- .../fluid/operators/unique_consecutive_op.h | 287 --------- .../kernels/cpu/unique_consecutive_functor.h | 261 ++++++++ .../kernels/cpu/unique_consecutive_kernel.cc | 77 +++ .../kernels/gpu/unique_consecutive_functor.h} | 585 +++++++++--------- .../kernels/gpu/unique_consecutive_kernel.cu | 81 +++ .../phi/kernels/unique_consecutive_kernel.h | 34 + .../phi/ops/compat/unique_consecutive_sig.cc | 30 + 8 files changed, 764 insertions(+), 599 deletions(-) delete mode 100644 paddle/fluid/operators/unique_consecutive_op.h create mode 100644 paddle/phi/kernels/cpu/unique_consecutive_functor.h create mode 100644 paddle/phi/kernels/cpu/unique_consecutive_kernel.cc rename paddle/{fluid/operators/unique_consecutive_op.cu => phi/kernels/gpu/unique_consecutive_functor.h} (53%) create mode 100644 paddle/phi/kernels/gpu/unique_consecutive_kernel.cu create mode 100644 paddle/phi/kernels/unique_consecutive_kernel.h create mode 100644 paddle/phi/ops/compat/unique_consecutive_sig.cc diff --git a/paddle/fluid/operators/unique_consecutive_op.cc b/paddle/fluid/operators/unique_consecutive_op.cc index 73f6918d52598..0a36af362deb0 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cc +++ b/paddle/fluid/operators/unique_consecutive_op.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/unique_consecutive_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -118,11 +117,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(unique_consecutive, ops::UniqueConsecutiveOp, ops::UniqueConsecutiveOpMaker); -REGISTER_OP_CPU_KERNEL(unique_consecutive, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel); REGISTER_OP_VERSION(unique_consecutive) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/unique_consecutive_op.h b/paddle/fluid/operators/unique_consecutive_op.h deleted file mode 100644 index b0eadbd877de5..0000000000000 --- a/paddle/fluid/operators/unique_consecutive_op.h +++ /dev/null @@ -1,287 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/operators/unique_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { -template -static void UniqueConsecutiveFlattendTensor( - const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, - bool return_inverse, - bool return_counts) { - const InT* in_data = in.data(); - std::vector out_vec(in.numel()); - std::vector inverse_vec(in.numel()); - std::vector counts_vec(in.numel()); - memcpy(out_vec.data(), in_data, in.numel() * sizeof(InT)); - InT* p = out_vec.data(); - int64_t last = 0; - IndexT* q = counts_vec.data(); - for (int64_t i = 0; i < in.numel(); i++) { - if (in_data[i] != *p) { - *(++p) = in_data[i]; - if (return_counts) { - *(q++) = i - last; - last = i; - } - } - if (return_inverse) { - inverse_vec[i] = p - out_vec.data(); - } - } - - int64_t output_size = p - out_vec.data() + 1; - if (return_counts) { - *q = in.numel() - last; - counts_vec.resize(output_size); - } - out_vec.resize(output_size); - - out->Resize(phi::make_ddim({output_size})); - auto* out_data = out->mutable_data(context.GetPlace()); - std::copy(out_vec.begin(), out_vec.end(), out_data); - - if (return_inverse) { - auto* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({in.numel()})); - auto* inverse_data = inverse->mutable_data(context.GetPlace()); - std::copy(inverse_vec.begin(), inverse_vec.end(), inverse_data); - } - - if (return_counts) { - auto* count = context.Output("Counts"); - count->Resize(phi::make_ddim({out->numel()})); - auto* counts_data = count->mutable_data(context.GetPlace()); - std::copy(counts_vec.begin(), counts_vec.end(), counts_data); - } -} - -template -static ForwardIt UniqueConsecutiveDimImpl( - const framework::ExecutionContext& context, - ForwardIt first, - ForwardIt last, - const std::vector& sorted_indices_vec, - std::vector* inverse_vec, - std::vector* counts_vec) { - if (first == last) { - return last; - } - - (*inverse_vec)[sorted_indices_vec[0]] = 0; - (*counts_vec)[0] = 1; - - ForwardIt begin = first; - ForwardIt result = first; - - while (++first != last) { - int64_t idx_first = std::distance(begin, first); - int64_t idx_result = std::distance(begin, result); - if (!Equal(*result, *first)) { - if (++result != first) { - *result = std::move(*first); - } - idx_result += 1; - } - (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; - (*counts_vec)[idx_result] += 1; - } - return ++result; -} - -template -static void UniqueConsecutiveDim(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, - bool return_inverse, - bool return_counts, - int axis) { - // transpose tensor: eg. 
axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] - std::vector permute(in.dims().size()); - std::iota(permute.begin(), permute.end(), 0); - permute[axis] = 0; - permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); - in_trans_dims_vec[axis] = in.dims()[0]; - in_trans_dims_vec[0] = in.dims()[axis]; - framework::Tensor in_trans; - framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); - in_trans.Resize(in_trans_dims); - in_trans.mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - TransCompute( - in.dims().size(), dev_ctx, in, &in_trans, permute); - // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); - in_trans.Resize(in_trans_flat_dims); - - std::vector sorted_indices_vec(in_trans.dims()[0]); - std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); - int64_t col = in_trans.dims()[1]; - const InT* in_trans_data = in_trans.data(); - - // sort tensor according to indices - framework::Tensor input_sorted; - input_sorted.Resize(in_trans_dims); - input_sorted.mutable_data(context.GetPlace()); - InT* input_sorted_data = input_sorted.data(); - for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { - memcpy(input_sorted_data + i * col, - in_trans_data + static_cast(sorted_indices_vec[i]) * col, - col * sizeof(InT)); - } - std::vector input_unbind = Unbind(input_sorted); - std::vector inverse_vec(sorted_indices_vec.size(), 0); - std::vector counts_vec(sorted_indices_vec.size(), 0); - auto last = - UniqueConsecutiveDimImpl::iterator, InT>( - context, - input_unbind.begin(), - input_unbind.end(), - sorted_indices_vec, - &inverse_vec, - &counts_vec); - input_unbind.erase(last, input_unbind.end()); - counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); - - math::ConcatFunctor concat_functor; - framework::Tensor out_trans; - std::vector out_trans_dims_vec = in_trans_dims_vec; - out_trans_dims_vec[0] = input_unbind.size(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); - out_trans.mutable_data(context.GetPlace()); - std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); - out->mutable_data(context.GetPlace()); - concat_functor(dev_ctx, input_unbind, 0, &out_trans); - TransCompute( - out_trans.dims().size(), dev_ctx, out_trans, out, permute); - if (return_inverse) { - auto* inverse = context.Output("Index"); - framework::TensorFromVector(inverse_vec, context.device_context(), inverse); - } - if (return_counts) { - auto* count = context.Output("Counts"); - framework::TensorFromVector(counts_vec, context.device_context(), count); - } -} - -template -struct UniqueConsecutiveFlattendTensorFunctor { - const framework::ExecutionContext& ctx_; - const framework::Tensor& in_; - framework::Tensor* out_; - const bool return_inverse_; - const bool return_counts_; - - UniqueConsecutiveFlattendTensorFunctor( - const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, - bool return_inverse, - bool return_counts) - : ctx_(context), - in_(in), - out_(out), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueConsecutiveFlattendTensor( - ctx_, in_, out_, return_inverse_, return_counts_); - } -}; - -template -struct UniqueConsecutiveDimFunctor { - const framework::ExecutionContext& ctx_; - const framework::Tensor& in_; - framework::Tensor* out_; - const int 
axis_; - const bool return_inverse_; - const bool return_counts_; - UniqueConsecutiveDimFunctor(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out, - const int axis, - bool return_inverse, - bool return_counts) - : ctx_(context), - in_(in), - out_(out), - axis_(axis), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueConsecutiveDim( - ctx_, in_, out_, return_inverse_, return_counts_, axis_); - } -}; -template -class UniqueConsecutiveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto data_type = static_cast( - context.Attr("dtype")); - if (data_type == framework::proto::VarType::INT32) { - PADDLE_ENFORCE_LE( - x->numel(), - INT_MAX, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be less than or " - "equal to INT_MAX, but received num is %d. Please set `dtype` to " - "int64.", - x->numel())); - } - std::vector axis_vec = context.Attr>("axis"); - bool return_inverse = context.Attr("return_inverse"); - bool return_counts = context.Attr("return_counts"); - - if (axis_vec.empty()) { - framework::VisitDataTypeTiny( - data_type, - UniqueConsecutiveFlattendTensorFunctor( - context, *x, out, return_inverse, return_counts)); - } else { - int axis = axis_vec[0]; - framework::VisitDataTypeTiny( - data_type, - UniqueConsecutiveDimFunctor( - context, *x, out, axis, return_inverse, return_counts)); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/unique_consecutive_functor.h b/paddle/phi/kernels/cpu/unique_consecutive_functor.h new file mode 100644 index 0000000000000..85081e5806933 --- /dev/null +++ b/paddle/phi/kernels/cpu/unique_consecutive_functor.h @@ -0,0 +1,261 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
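// Editorial note, not part of the patch: a concrete example of the semantics
// implemented by the functors below, assuming a flattened input
//   x = [1, 1, 2, 2, 3, 1]
// the op produces
//   out     = [1, 2, 3, 1]        (consecutive duplicates collapsed)
//   inverse = [0, 0, 1, 1, 2, 3]  (index of each input element in out)
//   counts  = [2, 2, 1, 1]        (length of each consecutive run)
// Unlike `unique`, non-adjacent duplicates (the trailing 1) are kept.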
+ +#pragma once + +#include "paddle/fluid/framework/tensor_util.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +template +static void UniqueConsecutiveFlattenedTensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) { + const InT* in_data = in.data(); + std::vector out_vec(in.numel()); + std::vector inverse_vec(in.numel()); + std::vector counts_vec(in.numel()); + memcpy(out_vec.data(), in_data, in.numel() * sizeof(InT)); + InT* p = out_vec.data(); + int64_t last = 0; + IndexT* q = counts_vec.data(); + for (int64_t i = 0; i < in.numel(); i++) { + if (in_data[i] != *p) { + *(++p) = in_data[i]; + if (return_counts) { + *(q++) = i - last; + last = i; + } + } + if (return_inverse) { + inverse_vec[i] = p - out_vec.data(); + } + } + + int64_t output_size = p - out_vec.data() + 1; + if (return_counts) { + *q = in.numel() - last; + counts_vec.resize(output_size); + } + out_vec.resize(output_size); + + out->Resize(phi::make_ddim({output_size})); + auto* out_data = context.template Alloc(out); + std::copy(out_vec.begin(), out_vec.end(), out_data); + + if (return_inverse) { + inverse->Resize(phi::make_ddim({in.numel()})); + auto* inverse_data = context.template Alloc(inverse); + std::copy(inverse_vec.begin(), inverse_vec.end(), inverse_data); + } + + if (return_counts) { + count->Resize(phi::make_ddim({out->numel()})); + auto* counts_data = context.template Alloc(count); + std::copy(counts_vec.begin(), counts_vec.end(), counts_data); + } +} + +template +struct UniqueConsecutiveFlattenedTensorFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + const bool return_inverse_; + const bool return_counts_; + DenseTensor* inverse_; + DenseTensor* count_; + + UniqueConsecutiveFlattenedTensorFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) + : ctx_(context), + in_(in), + out_(out), + return_inverse_(return_inverse), + return_counts_(return_counts), + inverse_(inverse), + count_(count) {} + + template + void apply() const { + UniqueConsecutiveFlattenedTensor( + ctx_, in_, out_, return_inverse_, return_counts_, inverse_, count_); + } +}; + +template +static ForwardIt UniqueConsecutiveDimImpl( + const Context& context, + ForwardIt first, + ForwardIt last, + const std::vector& sorted_indices_vec, + std::vector* inverse_vec, + std::vector* counts_vec) { + if (first == last) { + return last; + } + + (*inverse_vec)[sorted_indices_vec[0]] = 0; + (*counts_vec)[0] = 1; + + ForwardIt begin = first; + ForwardIt result = first; + + while (++first != last) { + int64_t idx_first = std::distance(begin, first); + int64_t idx_result = std::distance(begin, result); + if (!phi::funcs::Equal(*result, *first)) { + if (++result != first) { + *result = std::move(*first); + } + idx_result += 1; + } + (*inverse_vec)[sorted_indices_vec[idx_first]] = idx_result; + (*counts_vec)[idx_result] += 1; + } + return ++result; +} + +template +static void UniqueConsecutiveDim(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + int axis, + DenseTensor* inverse, + DenseTensor* count) { + // transpose tensor: eg. 
axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] + std::vector permute(in.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + DenseTensor in_trans; + DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + phi::funcs::TransCompute( + in.dims().size(), context, in, &in_trans, permute); + // reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] + DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + in_trans.Resize(in_trans_flat_dims); + + std::vector sorted_indices_vec(in_trans.dims()[0]); + std::iota(sorted_indices_vec.begin(), sorted_indices_vec.end(), 0); + int64_t col = in_trans.dims()[1]; + const InT* in_trans_data = in_trans.data(); + + // sort tensor according to indices + DenseTensor input_sorted; + input_sorted.Resize(in_trans_dims); + context.template Alloc(&input_sorted); + InT* input_sorted_data = input_sorted.data(); + for (size_t i = 0; i < sorted_indices_vec.size(); ++i) { + memcpy(input_sorted_data + i * col, + in_trans_data + static_cast(sorted_indices_vec[i]) * col, + col * sizeof(InT)); + } + std::vector input_unbind = phi::funcs::Unbind(input_sorted); + std::vector inverse_vec(sorted_indices_vec.size(), 0); + std::vector counts_vec(sorted_indices_vec.size(), 0); + auto last = UniqueConsecutiveDimImpl::iterator, + InT>(context, + input_unbind.begin(), + input_unbind.end(), + sorted_indices_vec, + &inverse_vec, + &counts_vec); + input_unbind.erase(last, input_unbind.end()); + counts_vec.erase(counts_vec.begin() + input_unbind.size(), counts_vec.end()); + + phi::funcs::ConcatFunctor concat_functor; + DenseTensor out_trans; + std::vector out_trans_dims_vec = in_trans_dims_vec; + out_trans_dims_vec[0] = input_unbind.size(); + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + concat_functor(context, input_unbind, 0, &out_trans); + phi::funcs::TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); + if (return_inverse) { + paddle::framework::TensorFromVector(inverse_vec, context, inverse); + } + if (return_counts) { + paddle::framework::TensorFromVector(counts_vec, context, count); + } +} + +template +struct UniqueConsecutiveDimFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + const int axis_; + const bool return_inverse_; + const bool return_counts_; + DenseTensor* inverse_; + DenseTensor* count_; + + UniqueConsecutiveDimFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + const int axis, + bool return_inverse, + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) + : ctx_(context), + in_(in), + out_(out), + axis_(axis), + return_inverse_(return_inverse), + return_counts_(return_counts), + inverse_(inverse), + count_(count) {} + + template + void apply() const { + UniqueConsecutiveDim(ctx_, + in_, + out_, + return_inverse_, + return_counts_, + axis_, + inverse_, + count_); + } +}; + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc new file mode 100644 index 0000000000000..86fe53b72c985 --- /dev/null +++ 
b/paddle/phi/kernels/cpu/unique_consecutive_kernel.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unique_consecutive_kernel.h" +#include "paddle/phi/kernels/cpu/unique_consecutive_functor.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/data_type.h" + +namespace phi { + +template +void UniqueConsecutiveKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + DenseTensor* out, + DenseTensor* index, + DenseTensor* counts) { + auto data_type = static_cast(dtype); + if (data_type == paddle::framework::proto::VarType::INT32) { + PADDLE_ENFORCE_LE( + x.numel(), + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x.numel())); + } + + if (axis.empty()) { + paddle::framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveFlattenedTensorFunctor( + dev_ctx, x, out, return_inverse, return_counts, index, counts)); + } else { + int valid_axis = axis[0]; + paddle::framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveDimFunctor(dev_ctx, + x, + out, + valid_axis, + return_inverse, + return_counts, + index, + counts)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(unique_consecutive, + CPU, + ALL_LAYOUT, + phi::UniqueConsecutiveKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/fluid/operators/unique_consecutive_op.cu b/paddle/phi/kernels/gpu/unique_consecutive_functor.h similarity index 53% rename from paddle/fluid/operators/unique_consecutive_op.cu rename to paddle/phi/kernels/gpu/unique_consecutive_functor.h index b96499cdb20e8..e603f695039c0 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cu +++ b/paddle/phi/kernels/gpu/unique_consecutive_functor.h @@ -1,16 +1,18 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once #include #include #include @@ -22,13 +24,204 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() -#include "paddle/fluid/operators/unique_consecutive_op.h" // TransComute() +#include "paddle/fluid/framework/tensor_util.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/unique_functor.h" + +namespace phi { + +// The core logic of computing Unique Consecutive for a flattend Tensor +template +static void UniqueConsecutiveFlattenedCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t num_input, + DenseTensor* inverse, + DenseTensor* counts) { + // 0. Preparation + DenseTensor in_hat; + phi::Copy(context, in, context.GetPlace(), false, &in_hat); + auto in_data_hat = context.template Alloc(&in_hat); + + DenseTensor sorted_indices; + sorted_indices.Resize(phi::make_ddim({num_input})); + auto sorted_indices_data = context.template Alloc(&sorted_indices); + thrust::sequence( + thrust::device, sorted_indices_data, sorted_indices_data + num_input); + // 1. Calculate op result: 'out' + DenseTensor range; + range.Resize(phi::make_ddim({num_input + 1})); + auto range_data_ptr = context.template Alloc(&range); + thrust::sequence( + thrust::device, range_data_ptr, range_data_ptr + num_input + 1); + phi::Copy(context, in_hat, context.GetPlace(), false, out); + int num_out; + auto out_data = context.template Alloc(out); + num_out = + thrust::unique_by_key( + thrust::device, out_data, out_data + num_input, range_data_ptr, equal) + .first - + out_data; + out->Resize(phi::make_ddim({num_out})); + + // 2. Calculate inverse index: 'inverse' + if (return_inverse) { + inverse->Resize(phi::make_ddim({num_input})); + auto inverse_data = context.template Alloc(inverse); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({num_input})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + in_data_hat, + in_data_hat + num_input, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + num_input, + sorted_indices_data, + inverse_data); + } + // 3. 
Calculate 'counts' + if (return_counts) { + counts->Resize(phi::make_ddim({num_out})); + auto count_data = context.template Alloc(counts); + // init 'count_data' as 0 + thrust::fill(thrust::device, count_data, count_data + num_out, 0); + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = num_input; + thrust::adjacent_difference(thrust::device, + range_data_ptr + 1, + range_data_ptr + num_out + 1, + count_data); + } +} + +// functor for processing a flattend Tensor +template +struct UniqueConsecutiveFlattenedCUDAFunctor { + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; + const bool return_inverse_; + const bool return_counts_; + DenseTensor* inverse_; + DenseTensor* count_; + + UniqueConsecutiveFlattenedCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) + : ctx_(context), + in_(in), + out_(out), + return_inverse_(return_inverse), + return_counts_(return_counts), + inverse_(inverse), + count_(count) {} + + template + void apply() const { + UniqueConsecutiveFlattenedCUDATensor( + ctx_, + in_, + out_, + return_inverse_, + return_counts_, + thrust::equal_to(), + thrust::not_equal_to(), + in_.numel(), + inverse_, + count_); + } +}; -namespace paddle { -namespace operators { +// The logic of compute unique with axis required, it's a little different +// from above function +template +static void ComputeUniqueConsecutiveDims(const Context& context, + DenseTensor* sorted_indices, + IndexT* sorted_indices_data, + DenseTensor* out, + bool return_inverse, + bool return_counts, + equal_T equal, + not_equal_T not_equal, + int64_t row, + DenseTensor* inverse, + DenseTensor* counts) { + // 1. inverse indices: 'inverse' + inverse->Resize(phi::make_ddim({row})); + auto inverse_data = context.template Alloc(inverse); + DenseTensor inv_loc; + inv_loc.Resize(phi::make_ddim({row})); + auto inv_loc_data_ptr = context.template Alloc(&inv_loc); + thrust::adjacent_difference(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + inv_loc_data_ptr, + not_equal); + thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); + inv_loc_data_dev[0] = 0; + thrust::inclusive_scan(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + inv_loc_data_ptr); + thrust::scatter(thrust::device, + inv_loc_data_ptr, + inv_loc_data_ptr + row, + sorted_indices_data, + inverse_data); + + // 2. sorted indices + DenseTensor range; + range.Resize(phi::make_ddim({row + 1})); + auto range_data_ptr = context.template Alloc(&range); + thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); + int num_out; + num_out = thrust::unique_by_key(thrust::device, + sorted_indices_data, + sorted_indices_data + row, + range_data_ptr, + equal) + .first - + sorted_indices_data; + thrust::device_ptr range_data_ptr_dev(range_data_ptr); + range_data_ptr_dev[num_out] = row; + sorted_indices->Resize(phi::make_ddim({num_out})); -using Tensor = framework::Tensor; + // 3. 
counts: 'counts' + counts->Resize(phi::make_ddim({num_out})); + auto count_data = context.template Alloc(counts); + thrust::fill(thrust::device, count_data, count_data + row, 0); + thrust::adjacent_difference( + thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); +} // Binary function 'equal_to' template @@ -73,11 +266,11 @@ struct BinaryNotEqual { }; // index_select() function for Tensor -template -void IndexSelect(const framework::ExecutionContext& context, - const Tensor& input, - const Tensor& index, - Tensor* output, +template +void IndexSelect(const Context& context, + const DenseTensor& input, + const DenseTensor& index, + DenseTensor* output, int dim) { auto input_dim = input.dims(); auto input_dim_size = input_dim.size(); @@ -100,17 +293,15 @@ void IndexSelect(const framework::ExecutionContext& context, std::vector input_vec; std::vector index_vec; - paddle::framework::TensorToVector( - input, context.device_context(), &input_vec); - paddle::framework::TensorToVector( - index, context.device_context(), &index_vec); + paddle::framework::TensorToVector(input, context, &input_vec); + paddle::framework::TensorToVector(index, context, &index_vec); std::vector out_vec(output->numel()); for (int i = 0; i < index_size; i++) { PADDLE_ENFORCE_GE( index_vec[i], 0, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (index) of OP(index_select) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -119,7 +310,7 @@ void IndexSelect(const framework::ExecutionContext& context, PADDLE_ENFORCE_LT( index_vec[i], input_dim[dim], - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Variable value (index) of OP(index_select) " "expected >= 0 and < %ld, but got %ld. Please check input " "value.", @@ -139,162 +330,21 @@ void IndexSelect(const framework::ExecutionContext& context, } } } - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), output); + context.template Alloc(output); + paddle::framework::TensorFromVector(out_vec, context, output); output->Resize(output_dim); } -// The core logic of computing Unique Consecutive for a flattend Tensor -template -static void UniqueConsecutiveFlattendCUDATensor( - const framework::ExecutionContext& context, - const Tensor& in, - Tensor* out, - bool return_inverse, - bool return_counts, - equal_T equal, - not_equal_T not_equal, - int64_t num_input) { - // 0. Prepration - Tensor in_hat; - framework::TensorCopy(in, context.GetPlace(), &in_hat); - auto in_data_hat = in_hat.mutable_data(context.GetPlace()); - - Tensor sorted_indices; - sorted_indices.Resize(phi::make_ddim({num_input})); - auto sorted_indices_data = - sorted_indices.mutable_data(context.GetPlace()); - thrust::sequence( - thrust::device, sorted_indices_data, sorted_indices_data + num_input); - // 1. Calculate op result: 'out' - Tensor range; - range.Resize(phi::make_ddim({num_input + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence( - thrust::device, range_data_ptr, range_data_ptr + num_input + 1); - framework::TensorCopy(in_hat, context.GetPlace(), out); - int num_out; - auto out_data = out->mutable_data(context.GetPlace()); - num_out = - thrust::unique_by_key( - thrust::device, out_data, out_data + num_input, range_data_ptr, equal) - .first - - out_data; - out->Resize(phi::make_ddim({num_out})); - - // 2. 
Calculate inverse index: 'inverse' - if (return_inverse) { - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({num_input})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({num_input})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, - in_data_hat, - in_data_hat + num_input, - inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; // without device_ptr, segmentation fault - thrust::inclusive_scan(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + num_input, - inv_loc_data_ptr); - thrust::scatter(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + num_input, - sorted_indices_data, - inverse_data); - } - // 3. Calculate 'counts' - if (return_counts) { - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - // init 'count_data' as 0 - thrust::fill(thrust::device, count_data, count_data + num_out, 0); - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = num_input; - thrust::adjacent_difference(thrust::device, - range_data_ptr + 1, - range_data_ptr + num_out + 1, - count_data); - } -} - -// The logic of compute unique with axis required, it's a little different -// from above function -template -static void ComputeUniqueConsecutiveDims( - const framework::ExecutionContext& context, - Tensor* sorted_indices, - IndexT* sorted_indices_data, - Tensor* out, - bool return_inverse, - bool return_counts, - equal_T equal, - not_equal_T not_equal, - int64_t row) { - // 1. inverse indices: 'inverse' - Tensor* inverse = context.Output("Index"); - inverse->Resize(phi::make_ddim({row})); - auto inverse_data = inverse->mutable_data(context.GetPlace()); - Tensor inv_loc; - inv_loc.Resize(phi::make_ddim({row})); - auto inv_loc_data_ptr = inv_loc.mutable_data(context.GetPlace()); - thrust::adjacent_difference(thrust::device, - sorted_indices_data, - sorted_indices_data + row, - inv_loc_data_ptr, - not_equal); - thrust::device_ptr inv_loc_data_dev(inv_loc_data_ptr); - inv_loc_data_dev[0] = 0; - thrust::inclusive_scan(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + row, - inv_loc_data_ptr); - thrust::scatter(thrust::device, - inv_loc_data_ptr, - inv_loc_data_ptr + row, - sorted_indices_data, - inverse_data); - - // 2. sorted indices - Tensor range; - range.Resize(phi::make_ddim({row + 1})); - auto range_data_ptr = range.mutable_data(context.GetPlace()); - thrust::sequence(thrust::device, range_data_ptr, range_data_ptr + row + 1); - int num_out; - num_out = thrust::unique_by_key(thrust::device, - sorted_indices_data, - sorted_indices_data + row, - range_data_ptr, - equal) - .first - - sorted_indices_data; - thrust::device_ptr range_data_ptr_dev(range_data_ptr); - range_data_ptr_dev[num_out] = row; - sorted_indices->Resize(phi::make_ddim({num_out})); - - // 3. 
counts: 'counts' - Tensor* counts = context.Output("Counts"); - counts->Resize(phi::make_ddim({num_out})); - auto count_data = counts->mutable_data(context.GetPlace()); - thrust::fill(thrust::device, count_data, count_data + row, 0); - thrust::adjacent_difference( - thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); -} - // Calculate unique consecutive when 'axis' is set -template -static void UniqueConsecutiveDimsCUDATensor( - const framework::ExecutionContext& context, - const Tensor& in, - Tensor* out, - bool return_inverse, - bool return_counts, - int axis) { +template +static void UniqueConsecutiveDimsCUDATensor(const Context& context, + const DenseTensor& in, + DenseTensor* out, + bool return_inverse, + bool return_counts, + int axis, + DenseTensor* inverse, + DenseTensor* counts) { // 1. Transpose & reshape // Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] std::vector permute(in.dims().size()); @@ -304,19 +354,18 @@ static void UniqueConsecutiveDimsCUDATensor( std::vector in_trans_dims_vec(phi::vectorize(in.dims())); in_trans_dims_vec[axis] = in.dims()[0]; in_trans_dims_vec[0] = in.dims()[axis]; - framework::Tensor in_trans; - framework::DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); + DenseTensor in_trans; + DDim in_trans_dims = phi::make_ddim(in_trans_dims_vec); in_trans.Resize(in_trans_dims); - in_trans.mutable_data(context.GetPlace()); - auto& dev_ctx = context.cuda_device_context(); - TransCompute(in.dims().size(), // num of dims - dev_ctx, // device - in, // original Tensor - &in_trans, // Tensor after reshape - permute); // index of axis + context.template Alloc(&in_trans); + phi::funcs::TransCompute(in.dims().size(), // num of dims + context, // device + in, // original Tensor + &in_trans, // Tensor after reshape + permute); // index of axis // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] - framework::DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); + DDim in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); // now 'in_trans' is 2D @@ -324,16 +373,15 @@ static void UniqueConsecutiveDimsCUDATensor( int64_t row = in_trans.dims()[0]; const InT* in_trans_data = in_trans.data(); - Tensor sorted_indices; + DenseTensor sorted_indices; sorted_indices.Resize(phi::make_ddim({row})); - auto sorted_indices_data = - sorted_indices.mutable_data(context.GetPlace()); + auto sorted_indices_data = context.template Alloc(&sorted_indices); // 2. Calculate 'inverse', 'counts' // Init index thrust::sequence( thrust::device, sorted_indices_data, sorted_indices_data + row); - ComputeUniqueConsecutiveDims( + ComputeUniqueConsecutiveDims( context, &sorted_indices, sorted_indices_data, @@ -342,143 +390,70 @@ static void UniqueConsecutiveDimsCUDATensor( return_counts, BinaryEqual(col, in_trans_data), BinaryNotEqual(col, in_trans_data), - row); + row, + inverse, + counts); // 3. 
Select indices and reshape back to get 'out' - Tensor out_trans; + DenseTensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = sorted_indices.numel(); out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); - out_trans.mutable_data(context.GetPlace()); + context.template Alloc(&out_trans); - IndexSelect(context, in_trans, sorted_indices, &out_trans, 0); + IndexSelect( + context, in_trans, sorted_indices, &out_trans, 0); std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); out->Resize(phi::make_ddim(out_trans_dims_vec)); - out->mutable_data(context.GetPlace()); - std::vector out_trans_unbind = Unbind(out_trans); - math::ConcatFunctor concat_functor; - concat_functor(dev_ctx, out_trans_unbind, 0, &out_trans); - TransCompute( - out_trans.dims().size(), dev_ctx, out_trans, out, permute); + context.template Alloc(out); + std::vector out_trans_unbind = phi::funcs::Unbind(out_trans); + phi::funcs::ConcatFunctor concat_functor; + concat_functor(context, out_trans_unbind, 0, &out_trans); + phi::funcs::TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); } -// functor for processing a flattend Tensor -template -struct UniqueConsecutiveFlattendCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; - const bool return_inverse_; - const bool return_counts_; - - UniqueConsecutiveFlattendCUDAFunctor( - const framework::ExecutionContext& context, - const Tensor& in, - Tensor* out, - bool return_inverse, - bool return_counts) - : ctx_(context), - in_(in), - out_(out), - return_inverse_(return_inverse), - return_counts_(return_counts) {} - - template - void apply() const { - UniqueConsecutiveFlattendCUDATensor( - ctx_, - in_, - out_, - return_inverse_, - return_counts_, - thrust::equal_to(), - thrust::not_equal_to(), - in_.numel()); - } -}; - // functor for processing a multi-dimentional Tensor -template +template struct UniqueConsecutiveDimsCUDAFunctor { - const framework::ExecutionContext& ctx_; - const Tensor& in_; - Tensor* out_; + const Context& ctx_; + const DenseTensor& in_; + DenseTensor* out_; const int axis_; const bool return_inverse_; const bool return_counts_; + DenseTensor* inverse_; + DenseTensor* count_; - UniqueConsecutiveDimsCUDAFunctor(const framework::ExecutionContext& context, - const Tensor& in, - Tensor* out, + UniqueConsecutiveDimsCUDAFunctor(const Context& context, + const DenseTensor& in, + DenseTensor* out, const int axis, bool return_inverse, - bool return_counts) + bool return_counts, + DenseTensor* inverse, + DenseTensor* count) : ctx_(context), in_(in), out_(out), axis_(axis), return_inverse_(return_inverse), - return_counts_(return_counts) {} + return_counts_(return_counts), + inverse_(inverse), + count_(count) {} template void apply() const { - UniqueConsecutiveDimsCUDATensor( - ctx_, in_, out_, return_inverse_, return_counts_, axis_); + UniqueConsecutiveDimsCUDATensor(ctx_, + in_, + out_, + return_inverse_, + return_counts_, + axis_, + inverse_, + count_); } }; -// Unique_Consecutive_op CUDA implementation. 
-template -class UniqueConsecutiveKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto data_type = static_cast( - context.Attr("dtype")); - if (data_type == framework::proto::VarType::INT32) { - PADDLE_ENFORCE_LE( - x->numel() + 1, - INT_MAX, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be less than or " - "equal to INT_MAX, but received num is %d. Please set `dtype` to " - "int64.", - x->numel())); - } - - std::vector axis_vec = context.Attr>("axis"); - bool return_inverse = context.Attr("return_inverse"); - bool return_counts = context.Attr("return_counts"); - - // if 'axis' is not required, flatten the Tensor. - if (axis_vec.empty()) { - framework::VisitDataTypeTiny( - data_type, - UniqueConsecutiveFlattendCUDAFunctor( - context, *x, out, return_inverse, return_counts)); - } else { - // 'axis' is required. - int axis = axis_vec[0]; - framework::VisitDataTypeTiny( - data_type, - UniqueConsecutiveDimsCUDAFunctor( - context, *x, out, axis, return_inverse, return_counts)); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unique_consecutive, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel, - ops::UniqueConsecutiveKernel); +} // namespace phi diff --git a/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu new file mode 100644 index 0000000000000..4ce91a0dd66b4 --- /dev/null +++ b/paddle/phi/kernels/gpu/unique_consecutive_kernel.cu @@ -0,0 +1,81 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/unique_consecutive_kernel.h" +#include "paddle/phi/kernels/gpu/unique_consecutive_functor.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/fluid/framework/data_type.h" + +namespace phi { + +template +void UniqueConsecutiveKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + DenseTensor* out, + DenseTensor* index, + DenseTensor* counts) { + auto data_type = static_cast(dtype); + if (data_type == paddle::framework::proto::VarType::INT32) { + PADDLE_ENFORCE_LE( + x.numel() + 1, + INT_MAX, + phi::errors::InvalidArgument( + "The number of elements in Input(X) should be less than or " + "equal to INT_MAX, but received num is %d. Please set `dtype` to " + "int64.", + x.numel())); + } + + // if 'axis' is not required, flatten the Tensor. 
+ if (axis.empty()) { + paddle::framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveFlattenedCUDAFunctor( + dev_ctx, x, out, return_inverse, return_counts, index, counts)); + } else { + // 'axis' is required. + int valid_axis = axis[0]; + paddle::framework::VisitDataTypeTiny( + data_type, + UniqueConsecutiveDimsCUDAFunctor(dev_ctx, + x, + out, + valid_axis, + return_inverse, + return_counts, + index, + counts)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(unique_consecutive, + GPU, + ALL_LAYOUT, + phi::UniqueConsecutiveKernel, + float, + double, + int32_t, + int64_t) {} diff --git a/paddle/phi/kernels/unique_consecutive_kernel.h b/paddle/phi/kernels/unique_consecutive_kernel.h new file mode 100644 index 0000000000000..ade35d4d49730 --- /dev/null +++ b/paddle/phi/kernels/unique_consecutive_kernel.h @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void UniqueConsecutiveKernel(const Context& dev_ctx, + const DenseTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + DenseTensor* out, + DenseTensor* index, + DenseTensor* counts); + +} // namespace phi diff --git a/paddle/phi/ops/compat/unique_consecutive_sig.cc b/paddle/phi/ops/compat/unique_consecutive_sig.cc new file mode 100644 index 0000000000000..f085858d8cb0d --- /dev/null +++ b/paddle/phi/ops/compat/unique_consecutive_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UniqueConsecutiveOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("unique_consecutive", + {"X"}, + {"return_inverse", "return_counts", "axis", "dtype"}, + {"Out", "Index", "Counts"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(unique_consecutive, + phi::UniqueConsecutiveOpArgumentMapping); From 6cd79701ee27ab8c97b5309544369b98b4f23214 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 12 Jul 2022 10:51:20 +0800 Subject: [PATCH 132/250] [Eager] split coed gen for eager fluid_generated (#44177) * split coed gen for eager fluid_generated --- .gitignore | 3 +- .../eager/auto_code_generator/CMakeLists.txt | 64 ++--------- .../auto_code_generator/eager_generator.cc | 101 +++++++++++++----- .../final_state_generator/CMakeLists.txt | 5 +- .../final_state_generator/python_c_gen.py | 38 ++++--- .../generate_file_structures.py | 64 +++++++++-- paddle/fluid/pybind/.gitignore | 7 +- paddle/fluid/pybind/CMakeLists.txt | 80 ++++++++------ paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager.h | 3 +- paddle/fluid/pybind/eager_custom_python_api.h | 55 ++-------- .../eager_final_state_custom_python_api.h | 73 +++++++++++++ .../pybind/eager_op_function_generator.cc | 31 +++--- .../fluid/pybind/generate_file_structures.py | 28 +++++ paddle/fluid/pybind/op_function.h | 5 +- paddle/fluid/pybind/op_function_generator.cc | 8 +- 16 files changed, 340 insertions(+), 227 deletions(-) create mode 100644 paddle/fluid/pybind/eager_final_state_custom_python_api.h create mode 100644 paddle/fluid/pybind/generate_file_structures.py diff --git a/.gitignore b/.gitignore index 2c486ec96f106..74cf6b8ab0230 100644 --- a/.gitignore +++ b/.gitignore @@ -65,8 +65,7 @@ paddle/infrt/dialect/pd/common/pd_ops_info.h paddle/infrt/tests/dialect/Output paddle/infrt/tests/lit.cfg.py paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc -paddle/fluid/pybind/eager_final_state_op_function_impl.h -paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h +paddle/fluid/pybind/eager_final_state_op_function.cc # these files (directories) are generated before build system generation paddle/fluid/operators/generated_op.cc diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index ecfb40e947f91..162801c716962 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -26,36 +26,14 @@ endif() message( "Generate dygraph file structure at path: ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/generated" ) + +set(CODE_GEN_SPLIT_FILE_COUNT "8") + execute_process( COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/generate_file_structures.py" - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/") - -set(tmp_dygraph_forward_h_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.tmp.h" -) -set(tmp_dygraph_forward_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.tmp.cc" -) -set(tmp_dygraph_node_h_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.h" -) -set(tmp_dygraph_node_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.cc" -) -set(dygraph_forward_h_path - 
"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" -) -set(dygraph_forward_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions.cc" -) -set(dygraph_node_h_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h" -) -set(dygraph_node_cc_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.cc" -) + "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/" "${CODE_GEN_SPLIT_FILE_COUNT}") if(WIN32) set(EAGER_CODEGEN_DEPS eager_generator) @@ -114,22 +92,7 @@ if(WIN32) COMMAND "${eager_generator_path}/eager_generator.exe" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} - ${dygraph_forward_h_path} - COMMENT - "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} - ${dygraph_forward_cc_path} - COMMENT - "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} - ${dygraph_node_h_path} - COMMENT - "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} - ${dygraph_node_cc_path} - COMMENT - "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" + "${CODE_GEN_SPLIT_FILE_COUNT}" DEPENDS ${EAGER_CODEGEN_DEPS} VERBATIM) else() @@ -140,22 +103,7 @@ else() "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${CMAKE_CURRENT_BINARY_DIR}/../../pybind" "${CMAKE_CURRENT_BINARY_DIR}/eager_generator" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_h_path} - ${dygraph_forward_h_path} - COMMENT - "copy_if_different ${tmp_dygraph_forward_h_path} to ${dygraph_forward_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_forward_cc_path} - ${dygraph_forward_cc_path} - COMMENT - "copy_if_different ${tmp_dygraph_forward_cc_path} to ${dygraph_forward_cc_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_h_path} - ${dygraph_node_h_path} - COMMENT - "copy_if_different ${tmp_dygraph_node_h_path} to ${dygraph_node_h_path}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_dygraph_node_cc_path} - ${dygraph_node_cc_path} - COMMENT - "copy_if_different ${tmp_dygraph_node_cc_path} to ${dygraph_node_cc_path}" + "${CODE_GEN_SPLIT_FILE_COUNT}" DEPENDS eager_generator VERBATIM) endif() diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 1b3c7fd8e4649..4f5efe74fa9a6 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -3108,7 +3108,8 @@ static std::string GenerateCoreOpsReturnsInfo() { return core_ops_info_str; } -static void DygraphCodeGeneration(const std::string& output_dir) { +static void DygraphCodeGeneration(const std::string& output_dir, + int split_count) { std::string dygraph_forward_api_str = GenerateDygraphHFileIncludes(); std::string fwd_function_str = ""; std::string grad_node_h_str = ""; @@ -3116,6 +3117,8 @@ static void DygraphCodeGeneration(const std::string& output_dir) { auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); + 
paddle::flat_hash_map op_info_map_need_gen; + for (auto& pair : op_info_map) { const OpInfo& op_info = pair.second; proto::OpProto* op_proto = op_info.proto_; @@ -3126,6 +3129,31 @@ static void DygraphCodeGeneration(const std::string& output_dir) { continue; } + GradNodeGenerationInfo bwd_info; + + bool is_available = CollectGradInformationFromOpInfo(op_info, &bwd_info); + + if (!is_available && !bwd_info.GenerateForwardOnly()) { + VLOG(6) << "Skipped operator: " << op_type; + continue; + } + + op_info_map_need_gen.emplace(pair); + } + + int each_cc_file_api_size = op_info_map_need_gen.size() / split_count; + if (op_info_map_need_gen.size() % split_count != 0) { + each_cc_file_api_size++; + } + int api_index = 0; + int file_index = 0; + + for (auto& pair : op_info_map_need_gen) { + const OpInfo& op_info = pair.second; + proto::OpProto* op_proto = op_info.proto_; + + const std::string& op_type = op_proto->type(); + /* ----------------------------- */ /* ---- Collect Information ---- */ /* ----------------------------- */ @@ -3137,12 +3165,7 @@ static void DygraphCodeGeneration(const std::string& output_dir) { CollectForwardInformationFromOpInfo(op_info, &fwd_info); - bool is_available = CollectGradInformationFromOpInfo(op_info, &bwd_info); - - if (!is_available && !bwd_info.GenerateForwardOnly()) { - VLOG(6) << "Skipped operator: " << op_type; - continue; - } + CollectGradInformationFromOpInfo(op_info, &bwd_info); VLOG(6) << "-------- PurifyOpProto -------"; PurifyForwardOpProto(*op_proto, &fwd_info); @@ -3188,25 +3211,54 @@ static void DygraphCodeGeneration(const std::string& output_dir) { dygraph_forward_api_str += inplace_fwd_function_declare_str; } - if (bwd_info.GenerateForwardOnly()) continue; - - VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; - grad_node_h_str += GenerateGradNodeHeaderContents(fwd_info, bwd_info); - grad_node_h_str += "\n"; + if (!bwd_info.GenerateForwardOnly()) { + VLOG(6) << "-------- GenerateGradNodeHeaderContents -------"; + grad_node_h_str += GenerateGradNodeHeaderContents(fwd_info, bwd_info); + grad_node_h_str += "\n"; - VLOG(6) << "-------- GenerateGradNodeCCContents -------"; - grad_node_cc_str += GenerateGradNodeCCContents(fwd_info, bwd_info); - grad_node_cc_str += "\n"; + VLOG(6) << "-------- GenerateGradNodeCCContents -------"; + grad_node_cc_str += GenerateGradNodeCCContents(fwd_info, bwd_info); + grad_node_cc_str += "\n"; + } VLOG(6) << op_type << ": Finished Generating Op: " << op_type; + + api_index++; + if (api_index / each_cc_file_api_size > file_index) { + file_index++; + VLOG(6) << "-------- GenerateDygraphForwardCCFile -------"; + std::string forward_cc_path = output_dir + + "/forwards/dygraph_forward_functions" + + std::to_string(file_index) + ".tmp.cc"; + fwd_function_str += "\n"; + GenerateForwardDygraphFile(forward_cc_path, fwd_function_str); + fwd_function_str = ""; + + VLOG(6) << "-------- GenerateNodeCCFile -------"; + std::string node_cc_path = + output_dir + "/nodes/nodes" + std::to_string(file_index) + ".tmp.cc"; + GenerateNodeCCFile(node_cc_path, grad_node_cc_str); + grad_node_cc_str = ""; + } } + file_index++; VLOG(6) << "-------- GenerateDygraphForwardCCFile -------"; - std::string forward_cc_path = - output_dir + "/forwards/dygraph_forward_functions.tmp.cc"; - fwd_function_str += "\n"; - fwd_function_str += GenerateCoreOpsReturnsInfo(); + std::string forward_cc_path = output_dir + + "/forwards/dygraph_forward_functions" + + std::to_string(file_index) + ".tmp.cc"; GenerateForwardDygraphFile(forward_cc_path, 
fwd_function_str); + fwd_function_str = ""; + + GenerateForwardDygraphFile( + output_dir + "/forwards/dygraph_forward_functions_args_info.tmp.cc", + GenerateCoreOpsReturnsInfo()); + + VLOG(6) << "-------- GenerateNodeCCFile -------"; + std::string node_cc_path = + output_dir + "/nodes/nodes" + std::to_string(file_index) + ".tmp.cc"; + GenerateNodeCCFile(node_cc_path, grad_node_cc_str); + grad_node_cc_str = ""; VLOG(6) << "-------- GenerateForwardHFile -------"; std::string dygraph_forward_api_path = @@ -3216,26 +3268,23 @@ static void DygraphCodeGeneration(const std::string& output_dir) { VLOG(6) << "-------- GenerateNodeHFile -------"; std::string node_h_path = output_dir + "/nodes/nodes.tmp.h"; GenerateNodeHFile(node_h_path, grad_node_h_str); - - VLOG(6) << "-------- GenerateNodeCCFile -------"; - std::string node_cc_path = output_dir + "/nodes/nodes.tmp.cc"; - GenerateNodeCCFile(node_cc_path, grad_node_cc_str); } } // namespace framework } // namespace paddle int main(int argc, char* argv[]) { - if (argc != 2) { - std::cerr << "argc must be 2" << std::endl; + if (argc != 3) { + std::cerr << "argc must be 3" << std::endl; return -1; } std::string eager_root = argv[1]; + int split_count = atoi(argv[2]); paddle::framework::PrepareAttrMapForOps(); - paddle::framework::DygraphCodeGeneration(eager_root); + paddle::framework::DygraphCodeGeneration(eager_root, split_count); return 0; } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 8967354d244aa..ce1e81dd971ad 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -54,11 +54,10 @@ add_custom_target( VERBATIM) set(tmp_python_c_output_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/tmp_eager_final_state_op_function_impl.h" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function.cc.tmp" ) set(python_c_output_path - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function_impl.h" -) + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/eager_final_state_op_function.cc") add_custom_target( eager_final_state_python_c_codegen diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 66d8e8bfadab2..d1e7885bae4c1 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -139,22 +139,16 @@ def FindParsingFunctionFromAttributeType(atype): PYTHON_C_WRAPPER_TEMPLATE = \ """ -#pragma once - -#include "pybind11/detail/common.h" -#include "paddle/phi/api/all.h" -#include "paddle/phi/api/lib/dygraph_api.h" -#include "paddle/phi/common/backend.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/int_array.h" -#include "paddle/phi/api/include/sparse_api.h" -#include "paddle/phi/api/include/strings_api.h" -#include "paddle/fluid/pybind/op_function_common.h" -#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" -#include "paddle/fluid/pybind/exception.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" -#include +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/api/include/strings_api.h" +#include "paddle/fluid/pybind/eager_utils.h" 
+#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/pybind/eager_final_state_custom_python_api.h" +#include "paddle/fluid/pybind/eager.h" namespace paddle {{ namespace pybind {{ @@ -165,6 +159,16 @@ def FindParsingFunctionFromAttributeType(atype): {} }}; +void BindFinalStateEagerOpFunctions(pybind11::module *module) {{ + if (PyModule_AddFunctions(module->ptr(), EagerFinalStateMethods) < 0) {{ + PADDLE_THROW(platform::errors::Fatal ("Add functions to core.eager.ops failed!")); + }} + + if (PyModule_AddFunctions(module->ptr(), CustomEagerFinalStateMethods) < 0) {{ + PADDLE_THROW(platform::errors::Fatal ("Add functions to core.eager.ops failed!")); + }} +}} + }} // namespace pybind }} // namespace paddle """ @@ -449,8 +453,8 @@ def __init__(self, path): def GeneratePythonCFunctions(self): namespace = self.namespace - forward_api_list = self.forward_api_list + forward_api_list = self.forward_api_list for forward_api_content in forward_api_list: f_generator = PythonCSingleFunctionGenerator( forward_api_content, namespace) diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index a7cd1dc8c4673..d6574bc2e81fb 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -53,7 +53,7 @@ def GenerateFileStructureForFinalDygraph(eager_dir): open(path, 'a').close() -def GenerateFileStructureForIntermediateDygraph(eager_dir): +def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): """ paddle/fluid/eager |- generated @@ -86,11 +86,16 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir): dygraph_forward_api_h_path = os.path.join(generated_dir, "dygraph_forward_api.h") empty_files = [dygraph_forward_api_h_path] - empty_files.append( - os.path.join(forwards_dir, "dygraph_forward_functions.cc")) - empty_files.append(os.path.join(nodes_dir, "nodes.cc")) empty_files.append(os.path.join(nodes_dir, "nodes.h")) + for i in range(split_count): + empty_files.append( + os.path.join(forwards_dir, + "dygraph_forward_functions" + str(i + 1) + ".cc")) + empty_files.append(os.path.join(nodes_dir, + "nodes" + str(i + 1) + ".cc")) + empty_files.append( + os.path.join(forwards_dir, "dygraph_forward_functions_args_info.cc")) for path in empty_files: if not os.path.exists(path): open(path, 'a').close() @@ -102,23 +107,62 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir): forwards_level_cmakelist_path = os.path.join(forwards_dir, "CMakeLists.txt") with open(nodes_level_cmakelist_path, "w") as f: + f.write("add_custom_target(\n") + f.write(" copy_dygraph_node\n") f.write( - "cc_library(dygraph_node SRCS nodes.cc DEPS ${eager_deps} ${fluid_deps} ${fluid_manual_nodes})\n" + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.tmp.h\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n" ) - f.write("add_dependencies(dygraph_node eager_codegen)") + for i in range(split_count): + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes" + + str(i + 1) + + ".tmp.cc\" 
\"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes" + + str(i + 1) + ".cc\"\n") + + f.write(" DEPENDS eager_codegen\n") + f.write(" VERBATIM)\n") + + f.write("cc_library(dygraph_node SRCS ") + for i in range(split_count): + f.write("nodes" + str(i + 1) + ".cc ") + f.write("DEPS ${eager_deps} ${fluid_deps} ${fluid_manual_nodes})\n") + f.write("add_dependencies(dygraph_node copy_dygraph_node)") with open(forwards_level_cmakelist_path, "w") as f: + f.write("add_custom_target(\n") + f.write(" copy_dygraph_forward_functions\n") f.write( - "cc_library(dygraph_function SRCS dygraph_forward_functions.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${fluid_manual_functions})\n" + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.tmp.h\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h\"\n" ) - f.write("add_dependencies(dygraph_function eager_codegen)") + for i in range(split_count): + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions" + + str(i + 1) + + ".tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions" + + str(i + 1) + ".cc\"\n") + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.cc\"\n" + ) + f.write(" DEPENDS eager_codegen\n") + f.write(" VERBATIM)\n") + + f.write("cc_library(dygraph_function SRCS ") + for i in range(split_count): + f.write("dygraph_forward_functions" + str(i + 1) + ".cc ") + f.write("dygraph_forward_functions_args_info.cc ") + f.write( + "DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${fluid_manual_functions})\n" + ) + f.write( + "add_dependencies(dygraph_function copy_dygraph_forward_functions)") with open(generated_level_cmakelist_path, "w") as f: f.write("add_subdirectory(forwards)\nadd_subdirectory(nodes)") if __name__ == "__main__": - assert len(sys.argv) == 2 + assert len(sys.argv) == 3 eager_dir = sys.argv[1] - GenerateFileStructureForIntermediateDygraph(eager_dir) + split_count = int(sys.argv[2]) + GenerateFileStructureForIntermediateDygraph(eager_dir, split_count) GenerateFileStructureForFinalDygraph(eager_dir) diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore index 6869b6841a8a6..bd45f1ec2ea30 100644 --- a/paddle/fluid/pybind/.gitignore +++ b/paddle/fluid/pybind/.gitignore @@ -1,5 +1,4 @@ pybind.h -op_function_impl.h -eager_op_function_impl.h -eager_final_state_op_function_impl.h -tmp_eager_final_state_op_function_impl.h +op_function.cc +eager_op_function.cc +eager_final_state_op_function.cc diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b2ecf36c5d227..d5c7bcc30d176 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -101,11 +101,16 @@ endif() set(PYBIND_SRCS pybind.cc - exception.cc + imperative.cc + op_function.cc + inference_api.cc + ir.cc + bind_fleet_executor.cc + reader_py.cc protobuf.cc + exception.cc const_value.cc global_value_getter_setter.cc - reader_py.cc fleet_wrapper_py.cc heter_wrapper_py.cc 
ps_gpu_wrapper_py.cc @@ -113,11 +118,7 @@ set(PYBIND_SRCS box_helper_py.cc metrics_py.cc data_set_py.cc - imperative.cc - ir.cc bind_cost_model.cc - bind_fleet_executor.cc - inference_api.cc compatible.cc io.cc generator_py.cc @@ -125,6 +126,12 @@ set(PYBIND_SRCS cuda_streams_py.cc jit.cc) +execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/") + if(WITH_CUSTOM_DEVICE) set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) endif() @@ -189,7 +196,8 @@ if(WITH_PSCORE) set_source_files_properties( fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) list(APPEND PYBIND_DEPS fleet communicator index_wrapper index_sampler) - list(APPEND PYBIND_SRCS fleet_py.cc) + list(APPEND PYBIND_SRCS) + set(PYBIND_SRCS fleet_py.cc ${PYBIND_SRCS}) endif() if(WITH_NCCL OR WITH_RCCL) @@ -259,10 +267,10 @@ if(WITH_PYTHON) target_link_libraries(kernel_signature_generator ${ROCM_HIPRTC_LIB}) endif() - set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h) + set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function.cc) set(tmp_impl_file ${impl_file}.tmp) set(eager_impl_file - ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function_impl.h) + ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function.cc) set(tmp_eager_impl_file ${eager_impl_file}.tmp) set(OP_IMPL_DEPS op_function_generator) @@ -461,30 +469,31 @@ if(WITH_PYTHON) list(APPEND PYBIND_DEPS op_function_common) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - cc_library( - paddle_eager - SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc - eager_utils.cc eager_py_layer.cc - DEPS eager_api - autograd_meta - backward - grad_node_info - phi - op_function_common - final_dygraph_function - final_dygraph_node - dygraph_function - dygraph_node - accumulation_node - py_layer_node - global_utils - utils - python - custom_operator - custom_operator_node) - add_dependencies(paddle_eager eager_codegen) - add_dependencies(paddle_eager eager_op_function_generator_cmd) - list(APPEND PYBIND_DEPS paddle_eager) + set(PYBIND_SRCS eager.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_functions.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_method.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_properties.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_utils.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_py_layer.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_op_function.cc ${PYBIND_SRCS}) + set(PYBIND_SRCS eager_final_state_op_function.cc ${PYBIND_SRCS}) + list(APPEND PYBIND_DEPS eager_api) + list(APPEND PYBIND_DEPS autograd_meta) + list(APPEND PYBIND_DEPS backward) + list(APPEND PYBIND_DEPS grad_node_info) + list(APPEND PYBIND_DEPS phi) + list(APPEND PYBIND_DEPS op_function_common) + list(APPEND PYBIND_DEPS final_dygraph_function) + list(APPEND PYBIND_DEPS final_dygraph_node) + list(APPEND PYBIND_DEPS dygraph_function) + list(APPEND PYBIND_DEPS dygraph_node) + list(APPEND PYBIND_DEPS accumulation_node) + list(APPEND PYBIND_DEPS py_layer_node) + list(APPEND PYBIND_DEPS global_utils) + list(APPEND PYBIND_DEPS utils) + list(APPEND PYBIND_DEPS python) + list(APPEND PYBIND_DEPS custom_operator) + list(APPEND PYBIND_DEPS custom_operator_node) endif() cc_library( @@ -492,6 +501,11 @@ if(WITH_PYTHON) SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) + add_dependencies(paddle_pybind eager_codegen) + add_dependencies(paddle_pybind eager_op_function_generator_cmd) + endif() + if(NOT APPLE 
AND NOT WIN32) target_link_libraries(paddle_pybind rt) endif() diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index f436d0e96b5dc..03aace9b78e38 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #include "paddle/fluid/framework/python_headers.h" -#include "paddle/fluid/pybind/eager_op_function_impl.h" +#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/core/string_tensor.h" diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h index db2b438c3bd94..5560744ae1d49 100644 --- a/paddle/fluid/pybind/eager.h +++ b/paddle/fluid/pybind/eager.h @@ -40,6 +40,7 @@ void BindEager(pybind11::module* m); void BindEagerStringTensor(pybind11::module* module); void BindFunctions(PyObject* module); void BindEagerPyLayer(PyObject* module); - +void BindEagerOpFunctions(pybind11::module* module); +void BindFinalStateEagerOpFunctions(pybind11::module* module); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index 86586123ee46c..7ed58a1e956f6 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -15,8 +15,12 @@ #include +#include "paddle/fluid/eager/to_static/run_program_op_func.h" #include "paddle/phi/core/enforce.h" +namespace paddle { +namespace pybind { + static PyObject *eager_api_run_program(PyObject *self, PyObject *args, PyObject *kwargs) { @@ -57,55 +61,12 @@ static PyObject *eager_api_run_program(PyObject *self, } } -static PyObject *eager_api_final_state_linear(PyObject *self, - PyObject *args, - PyObject *kwargs) { - PyThreadState *tstate = nullptr; - try { - auto x = GetTensorFromArgs("linear", "X", args, 0, false); - auto weight = GetTensorFromArgs("linear", "weight", args, 1, false); - auto bias = GetTensorFromArgs("linear", "Bias", args, 2, true); - tstate = PyEval_SaveThread(); - if (bias.initialized()) { - auto mm_out = - matmul_final_state_dygraph_function(x, weight, false, false); - auto out = add_final_state_dygraph_function(mm_out, bias); - PyEval_RestoreThread(tstate); - tstate = nullptr; - return ToPyObject(out); - } else { - auto mm_out = - matmul_final_state_dygraph_function(x, weight, false, false); - PyEval_RestoreThread(tstate); - tstate = nullptr; - return ToPyObject(mm_out); - } - } catch (paddle::platform::EnforceNotMet &exception) { - if (tstate) { - PyEval_RestoreThread(tstate); - } - std::ostringstream sout; - sout << exception.what(); - sout << " [operator < linear > error]"; - exception.set_error_str(sout.str()); - ThrowExceptionToPython(std::current_exception()); - return nullptr; - } catch (...) 
{ - if (tstate) { - PyEval_RestoreThread(tstate); - } - ThrowExceptionToPython(std::current_exception()); - return nullptr; - } -} - -static PyMethodDef CustomEagerFinalStateMethods[] = { +static PyMethodDef CustomEagerMethods[] = { {"run_program", (PyCFunction)(void (*)(void))eager_api_run_program, METH_VARARGS | METH_KEYWORDS, "C++ interface function for run_program in dygraph."}, - {"final_state_linear", - (PyCFunction)(void (*)(void))eager_api_final_state_linear, - METH_VARARGS | METH_KEYWORDS, - "C++ interface function for run_program in dygraph."}, {nullptr, nullptr, 0, nullptr}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_final_state_custom_python_api.h b/paddle/fluid/pybind/eager_final_state_custom_python_api.h new file mode 100644 index 0000000000000..4774b33a722d5 --- /dev/null +++ b/paddle/fluid/pybind/eager_final_state_custom_python_api.h @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include + +#include "paddle/phi/core/enforce.h" + +namespace paddle { +namespace pybind { + +static PyObject *eager_api_final_state_linear(PyObject *self, + PyObject *args, + PyObject *kwargs) { + PyThreadState *tstate = nullptr; + try { + auto x = GetTensorFromArgs("linear", "X", args, 0, false); + auto weight = GetTensorFromArgs("linear", "weight", args, 1, false); + auto bias = GetTensorFromArgs("linear", "Bias", args, 2, true); + tstate = PyEval_SaveThread(); + if (bias.initialized()) { + auto mm_out = + matmul_final_state_dygraph_function(x, weight, false, false); + auto out = add_final_state_dygraph_function(mm_out, bias); + PyEval_RestoreThread(tstate); + tstate = nullptr; + return ToPyObject(out); + } else { + auto mm_out = + matmul_final_state_dygraph_function(x, weight, false, false); + PyEval_RestoreThread(tstate); + tstate = nullptr; + return ToPyObject(mm_out); + } + } catch (paddle::platform::EnforceNotMet &exception) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + std::ostringstream sout; + sout << exception.what(); + sout << " [operator < linear > error]"; + exception.set_error_str(sout.str()); + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } catch (...) 
{ + if (tstate) { + PyEval_RestoreThread(tstate); + } + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +} + +static PyMethodDef CustomEagerFinalStateMethods[] = { + {"final_state_linear", + (PyCFunction)(void (*)(void))eager_api_final_state_linear, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for run_program in dygraph."}, + {nullptr, nullptr, 0, nullptr}}; + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 7d84124a264a0..72c12b267d1c9 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -138,8 +138,6 @@ const char* PYBIND_ITEM_TEMPLATE = R"( {"%s", (PyCFunction)(void(*)(void))%s, M // These operators will skip automatical code generatrion and // need to be handwritten in CUSTOM_HANDWRITE_OP_FUNC_FILE std::unordered_set CUSTOM_HANDWRITE_OPS_SET = {"run_program"}; -const char* CUSTOM_HANDWRITE_OP_FUNC_FILE = - "#include \"paddle/fluid/pybind/eager_custom_python_api.h\"\n"; // clang-format on static inline bool FindInsMap(const std::string& op_type, @@ -413,7 +411,6 @@ GenerateOpFunctions() { std::vector op_function_list, bind_function_list; auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); - bool append_custom_head_file = false; for (auto& pair : op_info_map) { auto& op_info = pair.second; auto op_proto = op_info.proto_; @@ -423,7 +420,6 @@ GenerateOpFunctions() { auto& op_type = op_proto->type(); // Skip operators that will be handwriten in CUSTOM_HANDWRITE_OP_FUNC_FILE. if (CUSTOM_HANDWRITE_OPS_SET.count(op_type)) { - append_custom_head_file = true; continue; } // Skip operator which is not inherit form OperatorWithKernel, like while, @@ -480,9 +476,7 @@ GenerateOpFunctions() { bind_function_list.emplace_back(std::move(inplace_bind_function_str)); } } - if (append_custom_head_file) { - op_function_list.emplace_back(CUSTOM_HANDWRITE_OP_FUNC_FILE); - } + return std::make_tuple(op_function_list, bind_function_list); } @@ -498,18 +492,19 @@ int main(int argc, char* argv[]) { #endif std::vector headers{ - "\"pybind11/detail/common.h\"", - "\"paddle/fluid/pybind/eager_final_state_op_function_impl.h\"", - "\"paddle/fluid/pybind/op_function_common.h\"", + "", + "\"paddle/fluid/platform/enforce.h\"", "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"", + "\"paddle/fluid/pybind/eager_utils.h\"", + "\"paddle/fluid/platform/profiler/event_tracing.h\"", "\"paddle/fluid/pybind/exception.h\"", - ""}; + "\"paddle/fluid/pybind/op_function_common.h\"", + "\"paddle/fluid/pybind/eager_custom_python_api.h\"", + "\"paddle/fluid/pybind/eager.h\""}; std::ofstream out(argv[1], std::ios::out); - out << "#pragma once\n\n"; - for (auto& header : headers) { out << "#include " + header + "\n"; } @@ -542,22 +537,20 @@ int main(int argc, char* argv[]) { << core_ops_infos_registry << "\n {nullptr,nullptr,0,nullptr}" << "};\n\n"; - out << "inline void BindEagerOpFunctions(pybind11::module *module) {\n" + out << "void BindEagerOpFunctions(pybind11::module *module) {\n" << " InitOpsAttrTypeMap();\n" << " auto m = module->def_submodule(\"ops\");\n" << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " "core.eager.ops failed!\"));\n" << " }\n\n" - << " if (PyModule_AddFunctions(m.ptr(), EagerFinalStateMethods) < 0) {\n" - << " PADDLE_THROW(platform::errors::Fatal (\"Add 
functions to " - "core.eager.ops failed!\"));\n" - << " }\n\n" - << " if (PyModule_AddFunctions(m.ptr(), CustomEagerFinalStateMethods) < " + << " if (PyModule_AddFunctions(m.ptr(), CustomEagerMethods) < " "0) {\n" << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " "core.eager.ops failed!\"));\n" << " }\n\n" + + << " BindFinalStateEagerOpFunctions(&m);\n\n" << "}\n\n" << "} // namespace pybind\n" << "} // namespace paddle\n"; diff --git a/paddle/fluid/pybind/generate_file_structures.py b/paddle/fluid/pybind/generate_file_structures.py new file mode 100644 index 0000000000000..391c47b8ee700 --- /dev/null +++ b/paddle/fluid/pybind/generate_file_structures.py @@ -0,0 +1,28 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os + +if __name__ == "__main__": + assert len(sys.argv) == 2 + pybind_dir = sys.argv[1] + + empty_files = [os.path.join(pybind_dir, "eager_final_state_op_function.cc")] + empty_files.append(os.path.join(pybind_dir, "eager_op_function.cc")) + empty_files.append(os.path.join(pybind_dir, "op_function.cc")) + + for path in empty_files: + if not os.path.exists(path): + open(path, 'a').close() diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 16c902cadf9a1..5038dd5e6c4e2 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -257,8 +257,7 @@ PyObject* MakeReturnPyObject(const std::tuple& out) { return result; } +void BindOpFunctions(pybind11::module* module); + } // namespace pybind } // namespace paddle - -// This include must be the last line -#include "paddle/fluid/pybind/op_function_impl.h" diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index b25ed3b5c5894..9ddf0e7083f44 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -506,13 +506,15 @@ int main(int argc, char* argv[]) { std::vector headers{"\"paddle/fluid/imperative/tracer.h\"", "\"paddle/fluid/platform/profiler.h\"", + "\"pybind11/numpy.h\"", + "\"pybind11/pybind11.h\"", "\"pybind11/detail/common.h\"", + "\"paddle/fluid/pybind/eager_utils.h\"", + "\"paddle/fluid/pybind/op_function.h\"", ""}; std::ofstream out(argv[1], std::ios::out); - out << "#pragma once\n\n"; - for (auto& header : headers) { out << "#include " + header + "\n"; } @@ -532,7 +534,7 @@ int main(int argc, char* argv[]) { << "\n {nullptr,nullptr,0,nullptr}" << "};\n\n"; - out << "inline void BindOpFunctions(pybind11::module *module) {\n" + out << "void BindOpFunctions(pybind11::module *module) {\n" << " auto m = module->def_submodule(\"ops\");\n" << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " From 1fd611064dd3e6db7bfc0aa60eb67a12bb390590 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 12 Jul 2022 11:18:03 +0800 Subject: [PATCH 133/250] [IPU] add more UTs 0/N (#44206) * add authors 
Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen * squash py changes 0/N Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen --- python/paddle/fluid/compiler.py | 2 +- .../fluid/tests/unittests/ipu/op_test_ipu.py | 72 +++--- .../ipu/test_affine_channel_op_ipu.py | 98 ++++++++ .../ipu/test_binary_cross_entropy_op_ipu.py | 101 ++++++++ .../tests/unittests/ipu/test_clip_op_ipu.py | 220 ++++++++++++++++++ .../ipu/test_conv2d_transpose_op_ipu.py | 162 +++++++++++++ .../tests/unittests/ipu/test_conv_op_ipu.py | 2 +- .../ipu/test_cross_entropy2_op_ipu.py | 30 +++ .../tests/unittests/ipu/test_cumsum_op_ipu.py | 30 +++ .../unittests/ipu/test_data_norm_op_ipu.py | 130 +++++++++++ .../tests/unittests/ipu/test_dist_op_ipu.py | 95 ++++++++ 11 files changed, 910 insertions(+), 32 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 11ccd476b1b59..1f81afbed64d7 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -635,7 +635,7 @@ def patch_getter(self, item): if not isinstance(item, CacheKey): raise ValueError( 'type(item) should be CacheKey, but received %s' % - type_name(item)) + type(item).__name__) item_id = hash(item) self._recent_key = item_id if item_id not in self._caches or ipu_strategy.need_compile: diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 5f2a0d59bb8be..becaaa4173ae7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -216,42 +216,54 @@ def check(self, check_shape=False, output_dict=None): raise ValueError("output_dict is empty") cpu_fp32 = output_dict[ExecutionMode.CPU_FP32] ipu_fp32 = output_dict[ExecutionMode.IPU_FP32] - cpu_fp32 = np.asarray(cpu_fp32).astype(np.float32).flatten() - ipu_fp32 = np.asarray(ipu_fp32).astype(np.float32).flatten() - pass_check = np.allclose(ipu_fp32, - cpu_fp32, - rtol=self.rtol, - atol=self.atol) - if not pass_check: - max_atol = np.abs(ipu_fp32 - cpu_fp32).max() - cpu_fp32_abs = np.abs(cpu_fp32) - cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 - max_rtol = (np.abs(ipu_fp32 - cpu_fp32) / cpu_fp32_abs).max() - raise AssertionError( - f"ipu_fp32 check failed. 
max_atol is {max_atol}, max_rtol is {max_rtol}" - ) - - if check_shape: - self.assertTrue(cpu_fp32.shape == ipu_fp32.shape) - - if ExecutionMode.IPU_FP16 in output_dict.keys(): - ipu_fp16 = output_dict[ExecutionMode.IPU_FP16] - ipu_fp16 = np.asarray(ipu_fp16).astype(np.float32).flatten() - pass_check = np.allclose(ipu_fp16, - cpu_fp32, - rtol=self.rtol_fp16, - atol=self.atol_fp16) + if len(cpu_fp32) != len(ipu_fp32): + raise ValueError("different outputs number between ipu and cpu.") + for cpu_fp32_res, ipu_fp32_res in zip(cpu_fp32, ipu_fp32): + cpu_fp32_res = np.asarray(cpu_fp32_res).astype(np.float32).flatten() + ipu_fp32_res = np.asarray(ipu_fp32_res).astype(np.float32).flatten() + pass_check = np.allclose(ipu_fp32_res, + cpu_fp32_res, + rtol=self.rtol, + atol=self.atol) if not pass_check: - max_atol = np.abs(ipu_fp16 - cpu_fp32).max() - cpu_fp32_abs = np.abs(cpu_fp32) + max_atol = np.abs(ipu_fp32_res - cpu_fp32_res).max() + cpu_fp32_abs = np.abs(cpu_fp32_res) cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 - max_rtol = (np.abs(ipu_fp16 - cpu_fp32) / cpu_fp32_abs).max() + max_rtol = (np.abs(ipu_fp32_res - cpu_fp32_res) / + cpu_fp32_abs).max() raise AssertionError( - f"ipu_fp16 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" + f"ipu_fp32 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" ) if check_shape: - self.assertTrue(ipu_fp16.shape == cpu_fp32.shape) + self.assertTrue(cpu_fp32_res.shape == ipu_fp32_res.shape) + + if ExecutionMode.IPU_FP16 in output_dict.keys(): + ipu_fp16 = output_dict[ExecutionMode.IPU_FP16] + if len(cpu_fp32) != len(ipu_fp16): + raise ValueError( + "different outputs number between ipu and cpu.") + for cpu_fp32_res, ipu_fp16_res in zip(cpu_fp32, ipu_fp16): + cpu_fp32_res = np.asarray(cpu_fp32_res).astype( + np.float32).flatten() + ipu_fp16_res = np.asarray(ipu_fp16_res).astype( + np.float32).flatten() + pass_check = np.allclose(ipu_fp16_res, + cpu_fp32_res, + rtol=self.rtol_fp16, + atol=self.atol_fp16) + if not pass_check: + max_atol = np.abs(ipu_fp16_res - cpu_fp32_res).max() + cpu_fp32_abs = np.abs(cpu_fp32_res) + cpu_fp32_abs[cpu_fp32_abs == 0.0] = 1e-20 + max_rtol = (np.abs(ipu_fp16_res - cpu_fp32_res) / + cpu_fp32_abs).max() + raise AssertionError( + f"ipu_fp16 check failed. max_atol is {max_atol}, max_rtol is {max_rtol}" + ) + + if check_shape: + self.assertTrue(ipu_fp16_res.shape == cpu_fp32_res.shape) # Execution Mode class ExecutionMode(IntEnum): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py new file mode 100644 index 0000000000000..09a251585b381 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return False + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 32, 32]) + self.feed_fp32 = {'data': data.astype(np.float32)} + self.feed_fp16 = {'data': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['data_layout'] = 'NCHW' + + @IPUOpTest.static_graph + def build_model(self): + data = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + input_scale = paddle.fluid.layers.create_parameter( + shape=[self.feed_shape[0][1]], dtype="float32") + input_bias = paddle.fluid.layers.create_parameter( + shape=[self.feed_shape[0][1]], dtype="float32") + out = paddle.fluid.layers.affine_channel(data, + scale=input_scale, + bias=input_bias) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_data_feed(self): + data = np.random.uniform(size=[2, 4, 64, 64]) + self.feed_fp32 = {'data': data.astype(np.float32)} + self.feed_fp16 = {'data': data.astype(np.float16)} + + +@unittest.skip("Only support NCHW") +class TestNHWC(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['data_layout'] = 'NHWC' + + def set_data_feed(self): + data = np.random.uniform(size=[2, 64, 64, 3]) + self.feed_fp32 = {'data': data.astype(np.float32)} + self.feed_fp16 = {'data': data.astype(np.float16)} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py new file mode 100644 index 0000000000000..121755226ec34 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py @@ -0,0 +1,101 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[3, 4, 2, 2]) + target = np.random.uniform(size=[3, 4, 2, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "target": target.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "target": target.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'mean', + } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + target = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + out = F.binary_cross_entropy(x, target, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'sum', + } + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'none', + } + + def set_atol(self): + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 5e-2 + self.rtol_fp16 = 2e-2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py new file mode 100644 index 0000000000000..c61685e4a5e30 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py @@ -0,0 +1,220 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_feed(self): + data = np.random.uniform(size=[5, 5]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['min'] = 0.1 + self.attrs['max'] = 3.4 + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.clip(x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestNoMin(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['max'] = 3.4 + + +class TestNoMax(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['min'] = 0.1 + + +class TestNoMinNoMax(TestBase): + + def set_op_attrs(self): + self.attrs = {} + + +class TestMinMaxTensor(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + min = paddle.fluid.layers.fill_constant(name="min", + shape=[1], + dtype='float32', + value=0.1) + max = paddle.fluid.layers.fill_constant(name="max", + shape=[1], + dtype='float32', + value=3.4) + x = paddle.clip(x, min=min, max=max) + self.fetch_list = [x.name] + + +class TestMinTensor(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + min = paddle.fluid.layers.fill_constant(name="min", + shape=[1], + dtype='float32', + value=0.1) + x = paddle.clip(x, min=min) + self.fetch_list = [x.name] + + +class TestMaxTensor(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + max = paddle.fluid.layers.fill_constant(name="max", + shape=[1], + dtype='float32', + value=3.4) + x = paddle.clip(x, max=max) + self.fetch_list = [x.name] + + +class TestCombine1(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + min = paddle.fluid.layers.fill_constant(name="min", + shape=[1], + dtype='float32', + value=0.1) + x = paddle.clip(x, min=min, max=3.4) + self.fetch_list = [x.name] + + +class TestCombine2(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + max = paddle.fluid.layers.fill_constant(name="max", + shape=[1], + dtype='float32', + value=3.4) + x = paddle.clip(x, min=0.1, max=max) + self.fetch_list = [x.name] + + +class TestIntInput(TestBase): + + def set_feed(self): + data = np.random.uniform(size=[5, 5]) + self.feed_fp32 = {'x': 
data.astype(np.int32)} + self.feed_fp16 = {'x': data.astype(np.int32)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int32') + + x = paddle.clip(x, min=0.1, max=3.4) + self.fetch_list = [x.name] + + +class TestIntMinMax(TestBase): + + def set_feed(self): + data = np.random.uniform(size=[5, 5]) + self.feed_fp32 = {'x': data.astype(np.int32)} + self.feed_fp16 = {'x': data.astype(np.int32)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int32') + min = paddle.fluid.layers.fill_constant(name="min", + shape=[1], + dtype='int32', + value=1) + max = paddle.fluid.layers.fill_constant(name="max", + shape=[1], + dtype='int32', + value=3) + x = paddle.clip(x, min=min, max=max) + self.fetch_list = [x.name] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py new file mode 100644 index 0000000000000..64fdcc26636cf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py @@ -0,0 +1,162 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_feed(self): + data = np.random.uniform(size=[1, 3, 8, 8]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['num_filters'] = 3 + self.attrs['filter_size'] = 3 + self.attrs['padding'] = 0 + self.attrs['stride'] = 1 + self.attrs['dilation'] = 1 + self.attrs['bias_attr'] = False + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.static.nn.conv2d_transpose(x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['stride'] = 2 + + +@unittest.skip("Only support dilation=1") +class TestCase2(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['stride'] = 2 + self.attrs['dilation'] = 2 + + +class TestCase3(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['padding'] = 2 + + +class TestCase4(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['padding'] = "SAME" + + +class TestCase5(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['stride'] = 2 + self.attrs['padding'] = "SAME" + + +class TestCase6(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['padding'] = "VALID" + + +class TestCase7(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['padding'] = "VALID" + self.attrs['stride'] = 2 + + +class TestCase8(TestBase): + + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['filter_size'] = 4 + self.attrs['stride'] = 2 + + +class TestCase9(TestBase): + + # When bias_attr is not False, a Add Op will be added after conv2d_transpose Op. + # When bias_attr = None, the bias value is 0. 
+ def set_op_attrs(self): + super().set_op_attrs() + self.attrs['bias_attr'] = None + + +class TestCase10(TestBase): + + # When output_size is not None, the filter_size will be re-computed by output_size + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['filter_size'] = None + self.attrs['output_size'] = [12, 12] + + +class TestCase11(TestBase): + + # Depthwise conv2d transpose + def set_op_attrs(self): + super().set_op_attrs() + self.attrs['groups'] = 3 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index 5a2485e251c96..8fe7ee53ca2a8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -108,7 +108,7 @@ def set_op_attrs(self): class TestCase5(TestBase): - + # Depthwise conv2d def set_op_attrs(self): super().set_op_attrs() self.attrs['groups'] = 3 diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index ffd4368c089b5..5c456e2f4c331 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -116,5 +116,35 @@ def set_op_attrs(self): } +class TestCase4(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 7]) + label = np.random.randint(0, 7, [3, 5, 1], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +class TestCase5(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 6, 7]) + label = np.random.randint(0, 7, [3, 5, 6], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py index 75cd3c92322ab..99cb47394ff5e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -86,5 +86,35 @@ def set_op_attrs(self): self.attrs = {"exclusive": True, "reverse": True} +class TestCase4(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[1, 128]) + self.feed_fp32 = {"x": x.astype(np.int32)} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="int32") + out = paddle.fluid.layers.cumsum(x, **self.attrs) + self.fetch_list = [out.name] + + +class TestCase5(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[1, 128]) + self.feed_fp32 = {"x": x.astype(np.int64)} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="int64") + out = paddle.fluid.layers.cumsum(x, **self.attrs) + self.fetch_list = [out.name] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py new file mode 100644 index 0000000000000..94225660f4d59 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py @@ -0,0 +1,130 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_op_attrs(self): + self.attrs = {} + + def set_feed(self): + data = np.random.uniform(size=[32, 100]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.static.nn.data_norm(input=x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {"in_place": True} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.static.nn.data_norm(input=x, **self.attrs) + x = x + 1 + self.fetch_list = [x.name] + + +@unittest.skip("Do not support in_place=True when test single data_norm Op") +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"in_place": True} + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {"data_layout": "NHWC"} + + +class TestCase4(TestBase): + + def set_op_attrs(self): + self.attrs = {"epsilon": 0.001} + + +class TestCase5(TestBase): + + def set_op_attrs(self): + self.attrs = {"do_model_average_for_mean_and_var": True} + + +class TestCase6(TestBase): + # If enable_scale_and_shift=True, it requires to set values of scale and bias in `param_attr` + def set_op_attrs(self): + self.attrs = { + "param_attr": { + "scale_w": 0.5, + "bias": 0.1 + }, + "enable_scale_and_shift": True + } + + +class TestCase7(TestBase): + + def set_op_attrs(self): + self.attrs = { + "param_attr": { + "batch_size": 1e3, + "batch_sum": 0.1, + "batch_square": 1e3, + "scale_w": 0.5, + "bias": 0.1 + }, + "enable_scale_and_shift": True + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py new file mode 100644 index 0000000000000..c84e8ce9bebad --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + data_x = np.random.uniform(size=[8, 1, 6, 1]) + data_y = np.random.uniform(size=[7, 1, 5]) + self.feed_fp32 = { + "x": data_x.astype(np.float32), + "y": data_y.astype(np.float32) + } + self.feed_fp16 = { + "x": data_x.astype(np.float16), + "y": data_y.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {"p": 2} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + y = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + out = paddle.dist(x, y, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {"p": 0} + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"p": float("inf")} + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {"p": float("-inf")} + + +if __name__ == "__main__": + unittest.main() From e47e82d0a31f2febe5161373fbaa930de30b1e03 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 12 Jul 2022 11:18:17 +0800 Subject: [PATCH 134/250] [IPU] add more UTs 1/N (#44207) * add authors Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen * squash py changes 1/N Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen --- .../unittests/ipu/test_dy2static_fp16_ipu.py | 19 +- .../tests/unittests/ipu/test_dy2static_ipu.py | 120 +++++++--- .../unittests/ipu/test_expand_as_v2_op_ipu.py | 104 +++++++++ .../unittests/ipu/test_expand_v2_op_ipu.py | 115 ++++++++++ .../ipu/test_fill_any_like_op_ipu.py | 19 ++ .../test_flatten_contiguous_range_op_ipu.py | 92 ++++++++ .../tests/unittests/ipu/test_flip_op_ipu.py | 90 ++++++++ .../unittests/ipu/test_greater_op_ipu.py | 12 + .../unittests/ipu/test_huber_loss_op_ipu.py | 95 ++++++++ .../unittests/ipu/test_interpolate_ops_ipu.py | 211 ++++++++++++++++++ 10 files changed, 840 insertions(+), 37 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py create mode 100644 
python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py index 1484c9fdcb53e..5168a6db339dc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py @@ -14,16 +14,12 @@ from __future__ import print_function -import numpy as np +import tempfile import unittest -import sys -import os + +import numpy as np import paddle -import paddle.fluid as fluid -from paddle.jit import to_static -from paddle.utils.cpp_extension import load -from paddle.optimizer.lr import LRScheduler -import tempfile +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest SEED = 2022 @@ -52,7 +48,9 @@ def forward(self, x, target=None): return x -class TestBase(unittest.TestCase): +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): @classmethod def setUpClass(cls): @@ -124,6 +122,9 @@ def _test(self, use_ipu=False): result.append(loss) + if use_ipu: + ipu_strategy.release_patch() + return np.array(result) def test_training(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py index 28decc76a421c..4cc9baea9f4b6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py @@ -14,59 +14,84 @@ from __future__ import print_function -import numpy as np +import tempfile import unittest -import sys + +import numpy as np import paddle -import paddle.fluid as fluid +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramCache +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest from paddle.jit import to_static -from paddle.utils.cpp_extension import load from paddle.optimizer.lr import LRScheduler -from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramCache -import tempfile +from functools import partial SEED = 2022 class SimpleLayer(paddle.nn.Layer): - def __init__(self, use_ipu=False): + def __init__(self, + loss_op=None, + use_softmax=True, + use_reduction=True, + use_identity_loss=True): super(SimpleLayer, self).__init__() - self.use_ipu = use_ipu + self.loss_op = loss_op self.conv = paddle.nn.Conv2D(in_channels=3, out_channels=1, kernel_size=2, stride=1) + self.use_softmax = use_softmax + self.use_reduction = use_reduction + self.use_identity_loss = use_identity_loss @to_static() def forward(self, x, target=None): x = self.conv(x) x = paddle.fluid.layers.flatten(x, axis=1) if target is not None: - x = paddle.fluid.layers.softmax(x) - loss = paddle.fluid.layers.cross_entropy(x, target) - if self.use_ipu: - loss = paddle.incubate.identity_loss(loss, 1) + if self.use_softmax: + x = paddle.fluid.layers.softmax(x) + if self.loss_op: + loss = self.loss_op(x, target) else: + loss = paddle.fluid.layers.cross_entropy(x, target) + if self.use_reduction: loss = paddle.mean(loss) + if self.use_identity_loss: + loss = paddle.incubate.identity_loss(loss, 1) return x, loss return x -class TestBase(unittest.TestCase): 
+@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): - @classmethod - def setUpClass(cls): + def setUp(self): paddle.disable_static() + self.set_op_attrs() + self.set_data_feed() + + def set_op_attrs(self): + self.loss_op = paddle.fluid.layers.cross_entropy + + def set_data_feed(self): + self.data = paddle.uniform((32, 3, 10, 10), dtype='float32') + self.label = paddle.randint(0, 10, shape=[32], dtype='int64') + + def create_model(self, use_ipu=False): + return SimpleLayer(loss_op=self.loss_op, + use_softmax=True, + use_reduction=not use_ipu, + use_identity_loss=use_ipu) def _test(self, use_ipu=False): paddle.seed(SEED) np.random.seed(SEED) - model = SimpleLayer(use_ipu) + model = self.create_model(use_ipu) optim = paddle.optimizer.Adam(learning_rate=0.01, parameters=model.parameters()) - data = paddle.uniform((32, 3, 10, 10), dtype='float32') - label = paddle.randint(0, 10, shape=[32], dtype='int64') if use_ipu: device = paddle.set_device('ipu') @@ -80,7 +105,7 @@ def _test(self, use_ipu=False): result = [] for epoch in range(100): # ipu only needs call model() to do forward/backward/grad_update - pred, loss = model(data, label) + pred, loss = model(self.data, self.label) if not use_ipu: loss.backward() optim.step() @@ -104,7 +129,6 @@ class TestSaveLoad(TestBase): @classmethod def setUpClass(cls): - paddle.disable_static() cls.save_path = tempfile.TemporaryDirectory() @classmethod @@ -114,11 +138,9 @@ def tearDownClass(cls): def _test(self, use_ipu=False): paddle.seed(SEED) np.random.seed(SEED) - model = SimpleLayer(use_ipu) + model = self.create_model(use_ipu) optim = paddle.optimizer.Adam(learning_rate=0.01, parameters=model.parameters()) - data = paddle.uniform((32, 3, 10, 10), dtype='float32') - label = paddle.randint(0, 10, shape=[32], dtype='int64') model_path = '{}/model_state_dict_{}.pdparams'.format( self.save_path, 'ipu' if use_ipu else 'cpu') optim_path = '{}/optim_state_dict_{}.pdopt'.format( @@ -136,7 +158,7 @@ def _test(self, use_ipu=False): result = [] for epoch in range(100): # ipu only needs call model() to do forward/backward/grad_update - pred, loss = model(data, label) + pred, loss = model(self.data, self.label) if not use_ipu: loss.backward() optim.step() @@ -155,7 +177,7 @@ def _test(self, use_ipu=False): for epoch in range(100): # ipu only needs call model() to do forward/backward/grad_update - pred, loss = model(data, label) + pred, loss = model(self.data, self.label) if not use_ipu: loss.backward() optim.step() @@ -169,10 +191,11 @@ def _test(self, use_ipu=False): return np.array(result) -class TestPatch(unittest.TestCase): +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestPatch(IPUOpTest): - @classmethod - def setUpClass(cls): + def setUp(cls): paddle.disable_static() def test(self, use_ipu=False): @@ -189,5 +212,46 @@ def test(self, use_ipu=False): self.assertTrue(reset_step is old_step) +class TestWithoutIdentityLoss1(TestBase): + + def create_model(self, use_ipu=False): + return SimpleLayer(loss_op=self.loss_op, + use_softmax=True, + use_reduction=True, + use_identity_loss=False) + + +class TestWithoutIdentityLoss2(TestBase): + + def set_op_attrs(self): + self.loss_op = paddle.fluid.layers.softmax_with_cross_entropy + + def set_data_feed(self): + self.data = paddle.uniform((32, 3, 10, 10), dtype='float32') + self.label = paddle.randint(0, 10, shape=[32, 1], dtype='int64') + + def create_model(self, use_ipu=False): + return 
SimpleLayer(loss_op=self.loss_op, + use_softmax=False, + use_reduction=True, + use_identity_loss=False) + + +class TestWithoutIdentityLoss3(TestBase): + + def set_op_attrs(self): + self.loss_op = partial(paddle.fluid.layers.kldiv_loss, reduction="none") + + def set_data_feed(self): + self.data = paddle.uniform((32, 3, 10, 10), dtype='float32') + self.label = paddle.rand(shape=[32, 81], dtype='float32') + + def create_model(self, use_ipu=False): + return SimpleLayer(loss_op=self.loss_op, + use_softmax=True, + use_reduction=True, + use_identity_loss=False) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py new file mode 100644 index 0000000000000..b299d9cfac728 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py @@ -0,0 +1,104 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + def set_data_feed(self): + data_x = np.random.uniform(size=[1, 3]) + data_y = np.random.uniform(size=[2, 2, 3]) + self.feed_fp32 = { + 'x': data_x.astype(np.float32), + 'y': data_y.astype(np.float32) + } + self.feed_fp16 = { + 'x': data_x.astype(np.float16), + 'y': data_y.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + y = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype="float32") + out = paddle.expand_as(x, y) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_data_feed(self): + data_x = np.random.uniform(size=[2, 3]) + data_y = np.random.uniform(size=[2, 4, 2, 3]) + self.feed_fp32 = { + 'x': data_x.astype(np.float32), + 'y': data_y.astype(np.float32) + } + self.feed_fp16 = { + 'x': data_x.astype(np.float16), + 'y': data_y.astype(np.float16) + } + + +@unittest.skip("corresponding dimensions must have the same value.") +class TestCase2(TestBase): + + def set_data_feed(self): + data_x = np.random.uniform(size=[2, 3]) + data_y = np.random.uniform(size=[2, 4, 3, 3]) + self.feed_fp32 = { + 'x': data_x.astype(np.float32), + 'y': 
data_y.astype(np.float32) + } + self.feed_fp16 = { + 'x': data_x.astype(np.float16), + 'y': data_y.astype(np.float16) + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py new file mode 100644 index 0000000000000..77872c9ebe47d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_attrs() + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_attrs(self): + self.attrs = {"shape": [2, 2, 3]} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + out = paddle.expand(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_attrs(self): + self.attrs = {"shape": [5, 2, 2, 3]} + + +class TestCase2(TestBase): + + def set_data_feed(self): + data = np.random.uniform(size=[2, 1, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + + def set_attrs(self): + self.attrs = {"shape": [5, 2, 2, 3]} + + +@unittest.skip("corresponding dimensions must have the same value.") +class TestCase3(TestBase): + + def set_attrs(self): + self.attrs = {"shape": [5, 2, 4, 3]} + + +@unittest.skip("Do not support `shape` = Tensors.") +class TestCase4(TestBase): + + def set_data_feed(self): + data = np.random.uniform(size=[3, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + self.attrs = { + 'name': 'y', + 'shape': [3], + 'dtype': 'int32', + 'value': 3, + } + y = paddle.fluid.layers.fill_constant(**self.attrs) + out = paddle.expand(x, shape=y) + self.fetch_list = [out.name] + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py index a6c497433020c..28e569d911847 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py @@ -69,5 +69,24 @@ def set_op_attrs(self): self.attrs = {'fill_value': 3, 'dtype': 'int32'} +class TestError(TestBase): + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.fluid.data('x', [-1, 3, 13], 'float32') + x_fill = paddle.full_like(x, **self.attrs) + out = paddle.fluid.layers.elementwise_add(x_fill, x_fill) + self.fetch_list = [out.name] + + def test(self): + self.build_model() + + def test_error(): + self.run_op_test(IPUOpTest.ExecutionMode.IPU_FP32) + + self.assertRaisesRegex(Exception, "Please check tensor shape setting", + test_error) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py new file mode 100644 index 0000000000000..4f84f20c1f1d5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py @@ -0,0 +1,92 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + data = np.random.uniform(size=[2, 2, 4, 6]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['start_axis'] = 0 + self.attrs['stop_axis'] = -1 + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + out = paddle.flatten(x=x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['start_axis'] = 0 + self.attrs['stop_axis'] = 2 + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = -1 + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs['start_axis'] = 1 + self.attrs['stop_axis'] = 2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py new file mode 100644 index 0000000000000..17b1bd9b2d0ea --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py @@ -0,0 +1,90 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_feed(self): + data = np.random.uniform(size=[3, 2, 2]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = [0, 1] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + x = paddle.flip(x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_feed(self): + data = np.random.randint(0, 10, size=[3, 2, 2]) + self.feed_fp32 = {'x': data.astype(np.int32)} + self.feed_fp16 = {'x': data.astype(np.int32)} + + +class TestCase2(TestBase): + + def set_feed(self): + data = np.random.randint(0, 2, size=[4, 3, 2, 2]) + self.feed_fp32 = {'x': data.astype(np.bool)} + self.feed_fp16 = {'x': data.astype(np.bool)} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py index eb3c0601dd148..56845eef475fa 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py @@ -127,5 +127,17 @@ def set_test_op(self): self.op = paddle.fluid.layers.equal +class TestGreaterEqual(TestGreaterThan): + + def set_test_op(self): + self.op = paddle.fluid.layers.greater_equal + + +class TestLessEqual(TestGreaterThan): + + def set_test_op(self): + self.op = paddle.fluid.layers.less_equal + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py new file mode 100644 index 0000000000000..a28120d820e5d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[3, 4, 2, 2]) + target = np.random.uniform(size=[3, 4, 2, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "target": target.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "target": target.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = { + 'delta': 1.0, + } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + target = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + out = paddle.fluid.layers.huber_loss(x, target, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'delta': 0.5, + } + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'delta': 0.0, + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py new file mode 100644 index 0000000000000..0d15f20273f04 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py @@ -0,0 +1,211 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[2, 3, 6, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + self.attrs["size"] = [12, 12] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + out = paddle.nn.functional.interpolate(x, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase0(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs["size"] = [3, 4] + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {} + self.attrs["scale_factor"] = [2, 1] + + +@unittest.skip("Only one of size or scale_factor should be defined") +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"size": [12, 12], "scale_factor": [2, 1]} + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {"scale_factor": 2.5} + + +class TestBilinear(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): + self.attrs = {"size": [12, 12], "mode": "bilinear"} + + +# Take long time +class TestBicubic(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): + self.attrs = {"size": [12, 12], "mode": "bicubic"} + + +# Trilinear requires 5-D input +class TestTrilinear(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + x = np.random.uniform(size=[2, 3, 3, 6, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = { + "size": [12, 12, 12], + "mode": "trilinear", + "data_format": "NCDHW" + } + + +# Linear requires 3-D input +class TestLinear(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + x = np.random.uniform(size=[3, 6, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = {"size": [12], "mode": "linear", "data_format": "NCW"} + + +@unittest.skip( + "Transfer to Pool Op with 2-D ksize, now we only support 1-D ksize.") +class TestArea(TestBase): + + def set_data_feed(self): + x = 
np.random.uniform(size=[2, 3, 6, 6]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = {"size": 12, "mode": "area"} + + +# align_corners option can only be set with the interpolating modes: linear | bilinear | bicubic | trilinear +class TestAlignCorners(TestBase): + + @property + def fp16_enabled(self): + return False + + def set_op_attrs(self): + self.attrs = { + "size": [12, 12], + "align_corners": True, + "mode": "bilinear" + } + + +# +class TestAlignMode(TestBase): + + def set_op_attrs(self): + self.attrs = {"size": [12, 12], "align_mode": 1} + + +if __name__ == "__main__": + unittest.main() From a05e7ef6fc1eaba2d709db3f2030f91d6fe1bf02 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 12 Jul 2022 11:18:28 +0800 Subject: [PATCH 135/250] [IPU] add more UTs 2/N (#44208) * add authors Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen * squash py changes 2/N Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen --- .../unittests/ipu/test_kldiv_loss_op_ipu.py | 95 +++++++++++ .../tests/unittests/ipu/test_matmul_op_ipu.py | 34 +++- .../unittests/ipu/test_matmul_v2_op_ipu.py | 30 ++++ .../unittests/ipu/test_meshgrid_op_ipu.py | 133 +++++++++++++++ .../unittests/ipu/test_model_pipeline_ipu.py | 100 +++++------- .../tests/unittests/ipu/test_p_norm_op_ipu.py | 75 +++++++++ .../tests/unittests/ipu/test_pad_op_ipu.py | 152 ++++++++++++++++++ .../tests/unittests/ipu/test_prelu_op_ipu.py | 1 - .../unittests/ipu/test_reduce_x_op_ipu.py | 54 +++++++ .../test_softmax_with_cross_entropy_op_ipu.py | 99 ++++++++++++ .../unittests/ipu/test_warpctc_op_ipu.py | 122 ++++++++++++++ 11 files changed, 833 insertions(+), 62 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py new file mode 100644 index 0000000000000..d6d48c650634d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + x = np.random.uniform(size=[3, 4, 2, 2]) + target = np.random.uniform(size=[3, 4, 2, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "target": target.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "target": target.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'mean', + } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + target = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + out = paddle.fluid.layers.kldiv_loss(x, target, **self.attrs) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'sum', + } + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'reduction': 'none', + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index 222bb20209750..fb8cf86b71cd1 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -157,8 +157,8 @@ def set_op_attrs(self): class TestCase7(TestBase): def set_data_feed(self): - x = np.random.uniform(size=[1, 12, 128, 64]) - y = np.random.uniform(size=[1, 12, 128, 64]) + x = np.random.uniform(size=[1, 3, 4, 5]) + y = np.random.uniform(size=[1, 3, 4, 5]) self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} @@ -205,5 +205,35 @@ def set_data_feed(self): self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)} +class TestCase10(TestBase): + + def set_op_attrs(self): + self.attrs = { + "transpose_y": True, + } + + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase11(TestBase): + + def set_op_attrs(self): + self.attrs = { + "transpose_x": True, + } + + def set_data_feed(self): + x = np.random.uniform(size=[4, 3, 2]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py index 
4777c42da138e..6e84066a4a1b1 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py @@ -150,5 +150,35 @@ def set_data_feed(self): } +class TestCase9(TestBase): + + def set_op_attrs(self): + self.attrs = { + "transpose_y": True, + } + + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase10(TestBase): + + def set_op_attrs(self): + self.attrs = { + "transpose_x": True, + } + + def set_data_feed(self): + x = np.random.uniform(size=[4, 3, 2]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py new file mode 100644 index 0000000000000..4efd4c5714bf8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
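(For orientation: paddle.meshgrid on 1-D inputs is assumed here to behave like NumPy's 'ij'-indexed meshgrid, so every output broadcasts to the combined shape. A small illustrative sketch, not part of the patch:)

import numpy as np

x = np.random.uniform(size=[100]).astype(np.float32)
y = np.random.uniform(size=[200]).astype(np.float32)
r1, r2 = np.meshgrid(x, y, indexing='ij')
# Both outputs carry the combined shape [len(x), len(y)].
assert r1.shape == (100, 200) and r2.shape == (100, 200)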
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_feed(self): + data1 = np.random.uniform(size=[100]) + data2 = np.random.uniform(size=[200]) + self.feed_fp32 = { + 'x': data1.astype(np.float32), + 'y': data2.astype(np.float32) + } + self.feed_fp16 = { + 'x': data1.astype(np.float16), + 'y': data2.astype(np.float16) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = [0, 1] + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + y = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype=self.feed_dtype[1]) + r1, r2 = paddle.meshgrid(x, y) + self.fetch_list = [r1.name, r2.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + for k, v in self.output_dict.items(): + self.output_dict[k] = np.concatenate([vv.flatten() for vv in v]) + self.check() + + +class TestCase1(TestBase): + + def set_feed(self): + data1 = np.random.uniform(size=[10]) + data2 = np.random.uniform(size=[20]) + data3 = np.random.uniform(size=[30]) + self.feed_fp32 = { + 'x': data1.astype(np.float32), + 'y': data2.astype(np.float32), + 'z': data3.astype(np.float32) + } + self.feed_fp16 = { + 'x': data1.astype(np.float16), + 'y': data2.astype(np.float16), + 'z': data3.astype(np.float16) + } + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + y = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype=self.feed_dtype[1]) + z = paddle.static.data(name=self.feed_list[2], + shape=self.feed_shape[2], + dtype=self.feed_dtype[2]) + r1, r2, r3 = paddle.meshgrid(x, y, z) + self.fetch_list = [r1.name, r2.name, r3.name] + + +class TestCase2(TestBase): + + def set_feed(self): + data1 = np.random.uniform(size=[100]) + data2 = np.random.uniform(size=[200]) + self.feed_fp32 = { + 'x': data1.astype(np.int32), + 'y': data2.astype(np.int32) + } + self.feed_fp16 = { + 'x': data1.astype(np.int32), + 'y': data2.astype(np.int32) + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py index 27538610a42b7..9f7ebc52834ac 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py @@ -12,79 +12,61 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function +import unittest import numpy as np -import unittest import paddle import paddle.static - -paddle.enable_static() -SEED = 2021 +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") -class TestCastNet(unittest.TestCase): - - def _test(self, run_ipu=True): - scope = paddle.static.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) +class TestBase(IPUOpTest): - with paddle.static.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data(name='image', - shape=[1, 3, 10, 10], - dtype='float32') - with paddle.static.ipu_shard_guard(index=0): - conv1 = paddle.static.nn.conv2d(image, - num_filters=3, - filter_size=3, - bias_attr=False) - with paddle.static.ipu_shard_guard(index=1): - conv2 = paddle.static.nn.conv2d(conv1, - num_filters=3, - filter_size=3, - bias_attr=False) - loss = paddle.mean(conv2) + def setUp(self): + self.set_training() + self.set_data_feed() + self.set_feed_attr() - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - executor = paddle.static.Executor(place) - executor.run(startup_prog) + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 10, 10]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} - if run_ipu: - feed_list = [image.name] - fetch_list = [loss.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.set_graph_config(num_ipus=2, - is_training=False, - enable_manual_shard=True) - ipu_strategy.set_pipelining_config(enable_pipelining=False) - program = paddle.static.IpuCompiledProgram( - main_prog, - ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) - else: - program = main_prog + def set_feed_attr(self): + self.feed_shape = [(1, 3, 10, 10)] + self.feed_list = list(self.feed_fp32.keys()) - loss_res = executor.run(program, - feed={"image": np_image}, - fetch_list=[loss]) - return loss_res + @IPUOpTest.static_graph + def build_model(self): + image = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + with paddle.static.ipu_shard_guard(index=0): + conv1 = paddle.static.nn.conv2d(image, + num_filters=3, + filter_size=3, + bias_attr=False) + with paddle.static.ipu_shard_guard(index=1): + conv2 = paddle.static.nn.conv2d(conv1, + num_filters=3, + filter_size=3, + bias_attr=False) + loss = paddle.mean(conv2) + self.fetch_list = [loss.name] - def test_cast(self): - cpu_outputs = self._test(False) - ipu_outputs = self._test(True) + def run_model(self, exec_mode): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(num_ipus=2, + is_training=False, + enable_manual_shard=True) + ipu_strategy.set_pipelining_config(enable_pipelining=True, + batches_per_step=2) + self.run_op_test(exec_mode, ipu_strategy=ipu_strategy) - self.assertTrue(np.allclose(cpu_outputs, ipu_outputs, atol=1e-4)) + def test(self): + self.build_model() + self.run_model(IPUOpTest.ExecutionMode.IPU_FP32) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py new file mode 100644 index 0000000000000..ec333ddff01b6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py @@ -0,0 +1,75 @@ +# 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_op_attrs(self): + self.attrs = {"p": 2} + + def set_feed(self): + data = np.random.uniform(size=[2, 3, 4]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.nn.functional.normalize(x, **self.attrs) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {"axis": 1} + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"p": 3.5, "axis": 1, "epsilon": 1e-3} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py new file mode 100644 index 0000000000000..02a488180aa0e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py @@ -0,0 +1,152 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
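(For orientation: the default case below pads a 4-D NCHW tensor with pad=[1, 2, 3, 4]. A NumPy sketch of the assumed layout, where the four entries map to [left, right, top, bottom] on the last two axes, is given here; it is illustrative and not part of the patch.)

import numpy as np

x = np.random.uniform(size=[5, 4, 2, 3]).astype(np.float32)
left, right, top, bottom = 1, 2, 3, 4
# Constant (zero) padding of the height and width dimensions only.
ref = np.pad(x, [(0, 0), (0, 0), (top, bottom), (left, right)], mode='constant')
assert ref.shape == (5, 4, 2 + top + bottom, 3 + left + right)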
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_op_attrs() + + def set_feed(self): + data = np.random.uniform(size=[5, 4, 2, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4]} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + pad = paddle.nn.functional.pad(x, **self.attrs) + self.fetch_list = [pad.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +@unittest.skip("Do not support `pad` as a tensor") +class TestCase1(TestBase): + + def set_op_attrs(self): + self.attrs = {} + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + const_attrs = { + 'name': 'y', + 'shape': [4], + 'dtype': 'int32', + 'value': 2, + } + y = paddle.fluid.layers.fill_constant(**const_attrs) + pad = paddle.nn.functional.pad(x, pad=y) + self.fetch_list = [pad.name] + + +class TestCase2(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [2, 5], "data_format": "NCL"} + + def set_feed(self): + data = np.random.uniform(size=[4, 2, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + +class TestCase3(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [2, 5, 2, 3, 6, 3], "data_format": "NCDHW"} + + def set_feed(self): + data = np.random.uniform(size=[2, 3, 4, 2, 3]) + self.feed_fp32 = {'x': data.astype(np.float32)} + self.feed_fp16 = {'x': data.astype(np.float16)} + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + +class TestCase4(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [2, 2, 1, 1], "mode": "reflect"} + + +@unittest.skip("replicate mode is not supported") +class TestCase5(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4], "mode": "replicate"} + + +@unittest.skip("circular mode is not supported") +class TestCase6(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4], "mode": "circular"} + + +@unittest.skip("Only support NCL, NCHW, NCDHW") +class TestCase7(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2], "data_format": "NLC"} + + +@unittest.skip("Only support NCL, NCHW, NCDHW") +class TestCase7(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4], "data_format": "NHWC"} + + +@unittest.skip("Only support NCL, NCHW, NCDHW") +class TestCase7(TestBase): + + def set_op_attrs(self): + self.attrs = {"pad": [1, 2, 3, 4, 1, 3], "data_format": "NDHWC"} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py index b06b0dc96f17f..0200cce0a33d0 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py @@ -61,7 +61,6 @@ def build_model(self): def run_model(self, exec_mode): ipu_strategy = paddle.static.IpuStrategy() ipu_strategy.set_graph_config(is_training=self.is_training) - ipu_strategy.set_options({'onnx_dump_path': 'onnx_dump_path.onnx'}) self.run_op_test(exec_mode, ipu_strategy=ipu_strategy) def test(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index ffa3c6d155025..4cfbb9a5e0b58 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -148,5 +148,59 @@ def set_test_op(self): self.op = paddle.fluid.layers.reduce_sum +class TestLogsumexp(TestMean): + + def set_test_op(self): + self.op = paddle.logsumexp + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + if 'dim' in self.attrs: + self.attrs['axis'] = self.attrs['dim'] + del self.attrs['dim'] + if 'keep_dim' in self.attrs: + self.attrs['keepdim'] = self.attrs['keep_dim'] + del self.attrs['keep_dim'] + out = self.op(x, **self.attrs) + self.fetch_list = [out.name] + + +class TestAll(TestMean): + + @property + def fp16_enabled(self): + return False + + def set_data_feed0(self): + data = np.random.choice(a=[False, True], size=(2, 4)) + self.feed_fp32 = {"in_0": data.astype(bool)} + self.set_feed_attr() + + def set_data_feed1(self): + data = np.random.choice(a=[False, True], size=(2, 2, 2)) + self.feed_fp32 = {"in_0": data.astype(bool)} + self.set_feed_attr() + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='bool') + out = self.op(x, **self.attrs) + self.fetch_list = [out.name] + + def set_test_op(self): + self.op = paddle.fluid.layers.reduce_all + + +class TestAny(TestAll): + + def set_test_op(self): + self.op = paddle.fluid.layers.reduce_any + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py index 97b0c25f9380e..21021cd9f598d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py @@ -106,5 +106,104 @@ def set_data_feed(self): } +class TestCase3(TestBase): + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 7]) + label = np.random.randint(0, 7, [3, 5, 1], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +class TestCase4(TestBase): + + def set_op_attrs(self): + self.attrs = { + 'soft_label': False, + 'return_softmax': True, + 'ignore_index': 1, + } + + @IPUOpTest.static_graph + def build_model(self, on_ipu): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + if on_ipu: + label = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='int32') + else: + label = paddle.static.data(name=self.feed_list[1], + 
shape=self.feed_shape[1], + dtype='int64') + loss, softmax = F.softmax_with_cross_entropy(x, label, **self.attrs) + self.fetch_list = [loss.name, softmax.name] + + def run_model(self, exec_mode): + if self.is_ipu_mode(exec_mode): + self.feed_fp32['label'] = self.feed_fp32['label'].astype(np.int32) + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model(self.is_ipu_mode(m)) + self.run_model(m) + self.check() + + +class TestCase5(TestCase4): + + def set_op_attrs(self): + self.attrs = { + 'soft_label': False, + 'return_softmax': True, + 'ignore_index': 1, + 'axis': 1, + } + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 7, 11]) + label = np.random.randint(0, 5, [3, 1, 7, 11], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +class TestCase6(TestCase4): + + def set_op_attrs(self): + self.attrs = { + 'soft_label': False, + 'return_softmax': True, + 'ignore_index': 1, + 'axis': 2, + } + + def set_data_feed(self): + x = np.random.uniform(size=[3, 5, 7, 9, 11]) + label = np.random.randint(0, 7, [3, 5, 1, 9, 11], dtype='int64') + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py new file mode 100644 index 0000000000000..8387b35015534 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py @@ -0,0 +1,122 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +import paddle.nn.functional as F + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_training(self): + # ctcloss only support training currently. 
+ self.is_training = True + self.epoch = 1 + + def set_data_feed(self): + self.batch_size = 16 + self.max_seq_length = 5 + self.max_label_length = 3 + self.num_classes = 5 + self.logits_length = np.array([self.max_seq_length] * self.batch_size, + dtype=np.int64) + self.labels_length = np.array([self.max_label_length] * self.batch_size, + dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + + logits = np.random.uniform( + 0.1, 1.0, [self.max_seq_length, self.batch_size, self.num_classes + ]).astype("float32") + labels = np.random.randint(0, + self.num_classes - 1, + [self.batch_size, self.max_label_length], + dtype="int32") + + self.feed_fp32 = { + "Logits": logits, + "Label": labels, + "input_length": self.logits_length.astype("int64"), + "label_length": self.labels_length.astype("int64"), + } + self.feed_fp16 = { + "Logits": logits.astype(np.float16), + "Label": labels, + "input_length": self.logits_length.astype("int64"), + "label_length": self.labels_length.astype("int64"), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = { + "blank": self.blank, + "norm_by_times": self.norm_by_times, + } + + @IPUOpTest.static_graph + def build_model(self): + data = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + logits = paddle.nn.Linear(self.num_classes, + self.num_classes, + bias_attr=False)(data) + labels = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='int32') + input_length = paddle.static.data(name=self.feed_list[2], + shape=self.feed_shape[2], + dtype='int64') + label_length = paddle.static.data(name=self.feed_list[3], + shape=self.feed_shape[3], + dtype='int64') + out = paddle.fluid.layers.warpctc(logits, + labels, + input_length=input_length, + label_length=label_length, + **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + self.fetch_list = [loss.name, out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + for m in IPUOpTest.ExecutionMode: + if not self.skip_mode(m): + self.build_model() + self.run_model(m) + self.check() + + +if __name__ == "__main__": + unittest.main() From 38cd47370423b18522bc2a305b7e1bf8540a71f9 Mon Sep 17 00:00:00 2001 From: zhaoying9105 Date: Tue, 12 Jul 2022 12:31:02 +0800 Subject: [PATCH 136/250] [MLU]: set numpy random seed for test_hard_sigmoid_op_mlu.py and test_hard_swish_op_mlu.py (#44219) --- .../fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py | 3 ++- .../paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py index a38c12c900470..5050e2006f333 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_hard_sigmoid_op_mlu.py @@ -25,7 +25,8 @@ import paddle.nn.functional as F paddle.enable_static() -SEED = 2021 +SEED = 2022 +np.random.seed(SEED) def ref_hardsigmoid(x, slope=0.166666666666667, offset=0.5): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py index e0ae182b41d19..89475eb698533 100644 --- 
a/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py @@ -25,7 +25,8 @@ sys.path.append("..") paddle.enable_static() -SEED = 2020 +SEED = 2021 +np.random.seed(SEED) def scalarToType(val, data_type): From 176a8832083372bca75e04a03ed6b1eeb147cd97 Mon Sep 17 00:00:00 2001 From: Chenxiao Niu Date: Tue, 12 Jul 2022 12:31:11 +0800 Subject: [PATCH 137/250] [MLU] fix mlu ctest for bilinear, relu6 and squared_l2_norm (#44230) --- .../tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py | 2 -- .../paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py | 6 +++--- .../tests/unittests/mlu/test_squared_l2_norm_op_mlu.py | 3 ++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py index 9806a4f74307d..d7e53639490d2 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_bilinear_interp_v2_op_mlu.py @@ -576,8 +576,6 @@ def test_case(self): place = core.CPUPlace() with fluid.dygraph.guard(place): input_data = np.random.random((2, 3, 6, 6)).astype("float32") - input_data = np.load('input.npy').astype("float32") - # print(input_data) input_x = paddle.to_tensor(input_data) expect_res = bilinear_interp_np(input_data, out_h=12, diff --git a/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py index ffb6fee30f5e7..a6bb42878a684 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_relu6_op_mlu.py @@ -15,13 +15,13 @@ from __future__ import print_function import paddle.fluid as fluid import paddle +import sys + +sys.path.append("..") from op_test import OpTest import numpy as np import unittest -import sys - -sys.path.append("..") paddle.enable_static() SEED = 2021 diff --git a/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py index eee7a4db55d77..6a81c11c70b1b 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_squared_l2_norm_op_mlu.py @@ -24,6 +24,8 @@ import paddle from paddle import _C_ops +paddle.enable_static() + class TestL2LossOp(OpTest): """Test squared_l2_norm @@ -66,5 +68,4 @@ def test_main(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() From 75aaa08a18a00d559ce75043fc3d0394f0ecdacc Mon Sep 17 00:00:00 2001 From: qipengh Date: Tue, 12 Jul 2022 14:27:24 +0800 Subject: [PATCH 138/250] [MLU]add elementwise_pow op (#44215) --- .../operators/elementwise/elementwise_mlu.h | 13 + .../elementwise/elementwise_pow_op_mlu.cc | 214 +++++++++++++++ .../mlu/test_elementwise_pow_op_mlu.py | 256 ++++++++++++++++++ 3 files changed, 483 insertions(+) create mode 100644 paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_elementwise_pow_op_mlu.py diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index d5c85e9f71cc1..50085f531a99d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -122,6 +122,7 @@ enum BINARY_FUNCTOR { DIVNONAN, MAXIMUM, 
MINIMUM, + POW, }; template @@ -171,6 +172,18 @@ inline void MLUBinary(const framework::ExecutionContext& ctx, MLUCnnl::Minimum(ctx, in1_desc, in1, in2_desc, in2, out_desc, out); } +template <> +inline void MLUBinary(const framework::ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t y_desc, + const void* y, + const cnnlTensorDescriptor_t out_desc, + void* out) { + MLUCnnl::Pow(ctx, prefer, x_desc, x, y_desc, y, out_desc, out); +} + template void MLUBinaryOp(const framework::ExecutionContext& ctx) { auto* x = ctx.Input("X"); diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc new file mode 100644 index 0000000000000..431122641ec3d --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_mlu.cc @@ -0,0 +1,214 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise/elementwise_mlu.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwisePowMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + MLUBinaryOp(ctx); + } +}; + +template +class ElementwisePowGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + auto place = ctx.GetPlace(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + axis = + (axis < 0 ? 
std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis); + + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + cnnlDataType_t data_type = ToCnnlDataType(); + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type); + MLUCnnlTensorDesc out_desc(max_dim, out_dims_array.data(), data_type); + + auto dout_dims = dout->dims(); + if (dx) { + // dx = dout * y * pow(x, y - 1); + Tensor one_dx(y->type()); + one_dx.mutable_data(phi::make_ddim(y_dims_array), place); + FillMLUTensorWithHostValue(ctx, static_cast(1), &one_dx); + + Tensor sub_dx(y->type()); + sub_dx.mutable_data(phi::make_ddim(y_dims_array), place); + MLUCnnlOpTensorDesc op_tensor_desc( + CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, + op_tensor_desc.get(), + y_desc.get(), + GetBasePtr(y), + y_desc.get(), + GetBasePtr(&one_dx), + y_desc.get(), + GetBasePtr(&sub_dx), + data_type); + + Tensor tmp_dx(x->type()); + tmp_dx.mutable_data(phi::make_ddim(out_dims_array), place); + MLUCnnl::Pow(ctx, + CNNL_COMPUTATION_HIGH_PRECISION, + x_desc.get(), + GetBasePtr(x), + y_desc.get(), + GetBasePtr(&sub_dx), + out_desc.get(), + GetBasePtr(&tmp_dx)); + + MLUCnnl::MulAx(ctx, + y_desc.get(), + GetBasePtr(y), + out_desc.get(), + GetBasePtr(&tmp_dx)); + MLUCnnl::MulAx(ctx, + out_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(&tmp_dx)); + + if (x_dims != dout_dims) { + dx->mutable_data(place); + std::vector reduce_axes; + GetReduceAxes(axis, dout_dims, x_dims, &reduce_axes); + if (!reduce_axes.empty()) { + MLUCnnlReduceDesc reduction_desc(reduce_axes, + CNNL_REDUCE_ADD, + data_type, + CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, + CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnl::Reduce(ctx, + true /*need_workspace*/, + reduction_desc.get(), + nullptr, + out_desc.get(), + GetBasePtr(&tmp_dx), + 0, + nullptr, + nullptr, + dx_desc.get(), + GetBasePtr(dx)); + } + } else { + dx->ShareDataWith(tmp_dx); + } + } + if (dy) { + // dy = dout * log(x) * pow(x, y) + Tensor tmp_dy(y->type()); + tmp_dy.mutable_data(phi::make_ddim(out_dims_array), place); + MLUCnnl::Pow(ctx, + CNNL_COMPUTATION_HIGH_PRECISION, + x_desc.get(), + GetBasePtr(x), + y_desc.get(), + GetBasePtr(y), + out_desc.get(), + GetBasePtr(&tmp_dy)); + + Tensor log_x(x->type()); + log_x.mutable_data(x->dims(), place); + MLUCnnl::Log(ctx, + CNNL_COMPUTATION_HIGH_PRECISION, + CNNL_LOG_E, + x_desc.get(), + GetBasePtr(x), + x_desc.get(), + GetBasePtr(&log_x)); + MLUCnnl::MulAx(ctx, + x_desc.get(), + GetBasePtr(&log_x), + out_desc.get(), + GetBasePtr(&tmp_dy)); + MLUCnnl::MulAx(ctx, + out_desc.get(), + GetBasePtr(dout), + out_desc.get(), + GetBasePtr(&tmp_dy)); + + if (y_dims != dout_dims) { + dy->mutable_data(place); + std::vector reduce_axes; + GetReduceAxes(axis, dout_dims, y_dims, &reduce_axes); + if (!reduce_axes.empty()) { + MLUCnnlReduceDesc reduction_desc(reduce_axes, + CNNL_REDUCE_ADD, + data_type, + CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, + CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dy_desc(*dy); + MLUCnnl::Reduce(ctx, + true /*need_workspace*/, + reduction_desc.get(), + nullptr, + out_desc.get(), + GetBasePtr(&tmp_dy), + 0, + nullptr, + nullptr, + dy_desc.get(), + GetBasePtr(dy)); + } + } else 
{ + dy->ShareDataWith(tmp_dy); + } + } + if (!dx && !dy) { + PADDLE_THROW(platform::errors::Unavailable( + "Not support all outputs to be empty.")); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(elementwise_pow, + ops::ElementwisePowMLUKernel, + ops::ElementwisePowMLUKernel); + +REGISTER_OP_MLU_KERNEL(elementwise_pow_grad, + ops::ElementwisePowGradMLUKernel, + ops::ElementwisePowGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_pow_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_pow_op_mlu.py new file mode 100644 index 0000000000000..7e04aed19c692 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_pow_op_mlu.py @@ -0,0 +1,256 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle.fluid as fluid +import paddle + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest + +paddle.enable_static() +SEED = 2022 + + +def ComputeGrad(x, y, out, axis): + grad = 1 / out.size + shape_x = x.shape + shape_y = y.shape + shape_out = out.shape + reduce_axes_x = [] + reduce_axes_y = [] + + if shape_x != shape_out: + if len(shape_x) < len(shape_out): + src_axis = axis + else: + src_axis = 0 + + for ax in range(len(shape_out)): + if (ax < src_axis or ax >= src_axis + len(shape_x)) or ( + shape_out[ax] > 1 and shape_x[ax - src_axis] == 1): + reduce_axes_x.append(ax) + + if shape_y != shape_out: + if len(shape_y) < len(shape_out): + src_axis = axis + else: + src_axis = 0 + + for ax in range(len(shape_out)): + if (ax < src_axis or ax >= src_axis + len(shape_y)) or ( + shape_out[ax] > 1 and shape_y[ax - src_axis] == 1): + reduce_axes_y.append(ax) + + if len(reduce_axes_x) > 0: + for i in reduce_axes_x: + x = np.expand_dims(x, axis=i) + + if len(reduce_axes_y) > 0: + for i in reduce_axes_y: + y = np.expand_dims(y, axis=i) + + dx = y * np.power(x, y - 1) * grad + dy = np.log(x) * np.power(x, y) * grad + + if len(reduce_axes_x) > 0: + for i, element in enumerate(reduce_axes_x): + dx = np.add.reduce(dx, element - i) + + if len(reduce_axes_y) > 0: + for i, element in enumerate(reduce_axes_y): + dy = np.add.reduce(dy, element - i) + + return dx, dy + + +class TestElementwisePow(OpTest): + + def setUp(self): + self.set_mlu() + self.op_type = "elementwise_pow" + + self.init_dtype() + self.init_input_output() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis} + self.outputs = {'Out': self.out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_axis(self): + self.axis = -1 
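    # Note (illustrative, not part of the patch): ComputeGrad above applies the
    # analytic gradients of out = x ** y, i.e. d(out)/dx = y * x ** (y - 1) and
    # d(out)/dy = log(x) * x ** y, each scaled by 1 / out.size because
    # check_grad_with_place compares against a mean-reduced output. For example,
    # with x = 1.5 and y = 2.5 the central difference
    # ((x + h) ** y - (x - h) ** y) / (2 * h) agrees with y * x ** (y - 1).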
+ + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.out = np.power(self.x, self.y) + + def test_check_grad_normal(self): + dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy]) + + def test_check_grad_ingore_x(self): + _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[dy]) + + def test_check_grad_ingore_y(self): + dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + user_defined_grads=[dx]) + + +class TestElementwisePowFp16(TestElementwisePow): + + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.out = np.power(self.x, self.y) + + def set_mlu(self): + self.__class__.use_mlu = True + # self.__class__.no_need_check_grad = True + self.place = paddle.device.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + +class TestElementwisePowOp_broadcast_0(TestElementwisePow): + + def init_axis(self): + self.axis = 1 + + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [1, 11, 17]).astype(self.dtype) + self.out = np.power(self.x, self.y) + + def test_check_grad_normal(self): + dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy]) + + def test_check_grad_ingore_x(self): + _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[dy]) + + def test_check_grad_ingore_y(self): + dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + user_defined_grads=[dx]) + + +class TestElementwisePowOp_broadcast_1(TestElementwisePow): + + def init_axis(self): + self.axis = 1 + + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(1, 2, [2, 100, 1]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) + self.out = np.power(self.x, self.y.reshape(1, 100, 1)) + + def test_check_grad_normal(self): + dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy]) + + def test_check_grad_ingore_x(self): + _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[dy]) + + def test_check_grad_ingore_y(self): + dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + user_defined_grads=[dx]) + + +class TestElementwisePowOp_broadcast_2(TestElementwisePow): + + def init_axis(self): + self.axis = 0 + + def init_input_output(self): + np.random.seed(SEED) + self.x = np.random.uniform(0.1, 1, [100, 3, 1]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, 
[100]).astype(self.dtype) + self.out = np.power(self.x, self.y.reshape(100, 1, 1)) + + def test_check_grad_normal(self): + dx, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy]) + + def test_check_grad_ingore_x(self): + _, dy = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[dy]) + + def test_check_grad_ingore_y(self): + dx, _ = ComputeGrad(self.x, self.y, self.out, self.axis) + self.check_grad_with_place(self.place, ['X'], + 'Out', + no_grad_set=set("Y"), + user_defined_grads=[dx]) + + +if __name__ == '__main__': + unittest.main() From f1be9cf1d0f3160bc211777f969a1592a02e68fb Mon Sep 17 00:00:00 2001 From: qipengh Date: Tue, 12 Jul 2022 14:36:02 +0800 Subject: [PATCH 139/250] [MLU]add sync_batch_norm op (#44176) --- paddle/fluid/operators/CMakeLists.txt | 4 + paddle/fluid/operators/mlu/mlu_baseop.cc | 229 +++++++- paddle/fluid/operators/mlu/mlu_baseop.h | 153 ++++++ .../fluid/operators/sync_batch_norm_op_mlu.cc | 492 ++++++++++++++++++ .../fluid/tests/unittests/mlu/CMakeLists.txt | 2 + .../unittests/mlu/sync_batch_norm_op_mlu.py | 105 ++++ 6 files changed, 974 insertions(+), 11 deletions(-) create mode 100644 paddle/fluid/operators/sync_batch_norm_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 17aabc25b3fa4..7fb00504ee2db 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -149,6 +149,10 @@ if (WITH_ASCEND_CL) op_library(sync_batch_norm_op) endif() +if (WITH_MLU) + op_library(sync_batch_norm_op) +endif() + op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute) op_library(eye_op DEPS ${OP_HEADER_DEPS}) op_library(recurrent_op DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 5531250f363b5..175fa9f94470f 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -259,15 +259,16 @@ MLUCnnlTensorDesc::~MLUCnnlTensorDesc() { MLUCnnlActivationDesc::MLUCnnlActivationDesc( const cnnlActivationMode_t act_mode, const float ceof) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_)); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetActivationDescriptor_v4( - active_desc_, - act_mode, - CNNL_ACTIVATION_HIGH_PRECISION, - CNNL_NOT_PROPAGATE_NAN, - ceof, - 1.0f /*sliced_dim*/, - 1.67326319217681884765625 /*selu_alpha*/, - 1.05070102214813232421875 /*selu_lambda*/)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSetActivationDescriptor_v5(active_desc_, + act_mode, + CNNL_ACTIVATION_HIGH_PRECISION, + CNNL_NOT_PROPAGATE_NAN, + ceof, + 1.0f /*sliced_dim*/, + 1.67326319217681884765625 /*selu_alpha*/, + 1.05070102214813232421875 /*selu_lambda*/, + false /*is_elu_mode*/)); } MLUCnnlActivationDesc::MLUCnnlActivationDesc( @@ -278,14 +279,15 @@ MLUCnnlActivationDesc::MLUCnnlActivationDesc( const float selu_lambda) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_)); PADDLE_ENFORCE_MLU_SUCCESS( - cnnlSetActivationDescriptor_v4(active_desc_, + cnnlSetActivationDescriptor_v5(active_desc_, act_mode, CNNL_ACTIVATION_HIGH_PRECISION, CNNL_NOT_PROPAGATE_NAN, ceof, sliced_dim, selu_alpha, - selu_lambda)); + selu_lambda, + false /*is_elu_mode*/)); } const cnnlActivationDescriptor_t 
MLUCnnlActivationDesc::get() const { @@ -2350,6 +2352,36 @@ MLURNNDesc::~MLURNNDesc() { workspace_size)); } +/* static */ void MLUCnnl::Pow(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetPowWorkspaceSize( + handle, input1_desc, input2_desc, output_desc, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlPow(handle, + prefer, + input1_desc, + input1, + input2_desc, + input2, + workspace_ptr, + workspace_size, + output_desc, + output)); +} + /* static */ void MLUCnnl::PowR(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input1_desc, @@ -4895,5 +4927,180 @@ MLURNNDesc::~MLURNNDesc() { grads_image)); } +/* static */ void MLUCnnl::SyncBatchNormStats( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const float eps, + const cnnlTensorDescriptor_t mean_desc, + void* mean, + const cnnlTensorDescriptor_t invstd_desc, + void* invstd) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSyncBatchNormStats( + handle, x_desc, x, eps, mean_desc, mean, invstd_desc, invstd)); +} + +/* static */ void MLUCnnl::SyncBatchNormGatherStatsWithCounts( + const ExecutionContext& ctx, + float momentum, + float eps, + const cnnlTensorDescriptor_t mean_all_desc, + const void* mean_all, + const cnnlTensorDescriptor_t invstd_all_desc, + const void* invstd_all, + const cnnlTensorDescriptor_t moving_mean_desc, + void* moving_mean, + const cnnlTensorDescriptor_t moving_var_desc, + void* moving_var, + const cnnlTensorDescriptor_t count_all_desc, + const void* count_all, + const cnnlTensorDescriptor_t mean_desc, + void* mean, + const cnnlTensorDescriptor_t invstd_desc, + void* invstd) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSyncBatchNormGatherStatsWithCounts(handle, + mean_all_desc, + mean_all, + invstd_all_desc, + invstd_all, + moving_mean_desc, + moving_mean, + moving_var_desc, + moving_var, + momentum, + eps, + count_all_desc, + count_all, + mean_desc, + mean, + invstd_desc, + invstd)); +} + +/* static */ void MLUCnnl::SyncBatchNormElemt( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t mean_desc, + const void* mean, + const cnnlTensorDescriptor_t invstd_desc, + const void* invstd, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t bias_desc, + const void* bias, + const cnnlTensorDescriptor_t y_desc, + void* y) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSyncBatchNormElemt(handle, + x_desc, + x, + mean_desc, + mean, + invstd_desc, + invstd, + weight_desc, + weight, + bias_desc, + bias, + y_desc, + y)); +} + +/* static */ void MLUCnnl::SyncBatchnormBackwardReduce( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t desc_dz, + const void* dz, + const cnnlTensorDescriptor_t desc_x, + const void* x, + const cnnlTensorDescriptor_t desc_mean, + const void* mean, + const cnnlTensorDescriptor_t 
desc_invstd, + const void* invstd, + const cnnlTensorDescriptor_t desc_dweight, + void* dweight, + const cnnlTensorDescriptor_t desc_dbias, + void* dbias, + const cnnlTensorDescriptor_t desc_sum_dy, + void* sum_dy, + const cnnlTensorDescriptor_t desc_sum_dy_xmu, + void* sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlSyncBatchnormBackwardReduce(handle, + desc_dz, + dz, + desc_x, + x, + desc_mean, + mean, + desc_invstd, + invstd, + desc_dweight, + dweight, + desc_dbias, + dbias, + desc_sum_dy, + sum_dy, + desc_sum_dy_xmu, + sum_dy_xmu, + needs_input_grad0, + needs_input_grad1, + needs_input_grad2)); +} + +/* static */ void MLUCnnl::SyncBatchNormBackwardElemt( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t mean_desc, + const void* mean, + const cnnlTensorDescriptor_t invstd_desc, + const void* invstd, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t sum_dy_desc, + const void* sum_dy, + const cnnlTensorDescriptor_t sum_dy_xmu_desc, + const void* sum_dy_xmu, + const cnnlTensorDescriptor_t count_desc, + const void* count, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSyncBatchNormBackwardElemtV2(handle, + diff_y_desc, + diff_y, + x_desc, + x, + mean_desc, + mean, + invstd_desc, + invstd, + weight_desc, + weight, + sum_dy_desc, + sum_dy, + sum_dy_xmu_desc, + sum_dy_xmu, + count_desc, + count, + diff_x_desc, + diff_x)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 07c5031ee2eb1..0d4c7d2e5a329 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1276,6 +1276,15 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); + static void Pow(const ExecutionContext& ctx, + cnnlComputationPreference_t prefer, + const cnnlTensorDescriptor_t input1_desc, + const void* input1, + const cnnlTensorDescriptor_t input2_desc, + const void* input2, + const cnnlTensorDescriptor_t output_desc, + void* output); + static void PowR(const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input1_desc, @@ -2030,8 +2039,152 @@ class MLUCnnl { const void* boxes, const cnnlTensorDescriptor_t grads_image_desc, void* grads_image); + + static void SyncBatchNormStats(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const float eps, + const cnnlTensorDescriptor_t mean_desc, + void* mean, + const cnnlTensorDescriptor_t invstd_desc, + void* invstd); + + static void SyncBatchNormGatherStatsWithCounts( + const ExecutionContext& ctx, + float momentum, + float eps, + const cnnlTensorDescriptor_t mean_all_desc, + const void* mean_all, + const cnnlTensorDescriptor_t invstd_all_desc, + const void* invstd_all, + const cnnlTensorDescriptor_t moving_mean_desc, + void* moving_mean, + const cnnlTensorDescriptor_t moving_var_desc, + void* moving_var, + const cnnlTensorDescriptor_t count_all_desc, + const void* count_all, + const cnnlTensorDescriptor_t mean_desc, + void* mean, + const cnnlTensorDescriptor_t invstd_desc, + void* invstd); + + static void SyncBatchNormElemt(const 
ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t mean_desc, + const void* mean, + const cnnlTensorDescriptor_t invstd_desc, + const void* invstd, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t bias_desc, + const void* bias, + const cnnlTensorDescriptor_t y_desc, + void* y); + + static void SyncBatchnormBackwardReduce( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t desc_dz, + const void* dz, + const cnnlTensorDescriptor_t desc_x, + const void* x, + const cnnlTensorDescriptor_t desc_mean, + const void* mean, + const cnnlTensorDescriptor_t desc_invstd, + const void* invstd, + const cnnlTensorDescriptor_t desc_dweight, + void* dweight, + const cnnlTensorDescriptor_t desc_dbias, + void* dbias, + const cnnlTensorDescriptor_t desc_sum_dy, + void* sum_dy, + const cnnlTensorDescriptor_t desc_sum_dy_xmu, + void* sum_dy_xmu, + const bool needs_input_grad0, + const bool needs_input_grad1, + const bool needs_input_grad2); + + static void SyncBatchNormBackwardElemt( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t diff_y_desc, + const void* diff_y, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t mean_desc, + const void* mean, + const cnnlTensorDescriptor_t invstd_desc, + const void* invstd, + const cnnlTensorDescriptor_t weight_desc, + const void* weight, + const cnnlTensorDescriptor_t sum_dy_desc, + const void* sum_dy, + const cnnlTensorDescriptor_t sum_dy_xmu_desc, + const void* sum_dy_xmu, + const cnnlTensorDescriptor_t count_desc, + const void* count, + const cnnlTensorDescriptor_t diff_x_desc, + void* diff_x); }; +const std::map, std::vector>> + TransPermMap = { + // trans_mode, (forward_perm, backward_perm) + {"3D_NCHW2NHWC", {{0, 2, 1}, {0, 2, 1}}}, + {"4D_NCHW2NHWC", {{0, 2, 3, 1}, {0, 3, 1, 2}}}, + {"5D_NCHWD2NDHWC", {{0, 4, 2, 3, 1}, {0, 4, 2, 3, 1}}}, + {"5D_NHWDC2NDHWC", {{0, 3, 1, 2, 4}, {0, 2, 3, 4, 1}}}}; + +inline void SetMLUTransposePerm(const framework::DDim& dims, + const DataLayout& data_layout, + std::vector* forward_perm, + std::vector* backward_perm, + std::vector* out_shape) { + const int dim_size = dims.size(); + PADDLE_ENFORCE_EQ((dim_size >= 3) && (dim_size <= 5), + true, + platform::errors::InvalidArgument( + "MLUTransposePerm func only support (dim_size >= 3) && " + "(dim_size <= 5), but now dim_size is %d.", + dim_size)); + + PADDLE_ENFORCE_EQ( + (data_layout == DataLayout::kNCHW) || (data_layout == DataLayout::kNHWC), + true, + platform::errors::InvalidArgument( + "MLUTransposePerm func only support DataLayout: kNCHW or kNHWC, but " + "now data_layout is %s.", + data_layout)); + + // case 1: NCHW of Paddle != NHWC of MLU when dims==3,4 + // case 2: NHWDC and NCHWD of Paddle != NDHWC of MLU when dims==5 + std::string map_key = ""; + if (data_layout == DataLayout::kNCHW) { + switch (dim_size) { + case 3: + map_key = "3D_NCHW2NHWC"; + break; + case 4: + map_key = "4D_NCHW2NHWC"; + break; + case 5: + map_key = "5D_NCHWD2NDHWC"; + break; + } + } else if (data_layout == DataLayout::kNHWC && dim_size == 5) { + map_key = "5D_NHWDC2NDHWC"; + } + assert(map_key != ""); + forward_perm->assign(TransPermMap.at(map_key).first.begin(), + TransPermMap.at(map_key).first.end()); + backward_perm->assign(TransPermMap.at(map_key).second.begin(), + TransPermMap.at(map_key).second.end()); + + auto in_dims = phi::vectorize(dims); + for (size_t i = 0; i < in_dims.size(); i++) { + 
out_shape->push_back(in_dims[forward_perm->at(i)]); + } +} + template inline void TransposeFromMLUTensor(const ExecutionContext& ctx, const std::vector perm, diff --git a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc new file mode 100644 index 0000000000000..ce511a12bbfdb --- /dev/null +++ b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc @@ -0,0 +1,492 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/platform/collective_helper.h" +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +#define GET_LAYOUT_OFFSET 2 +using Tensor = framework::Tensor; +static std::vector supported_input_layout = { + CNNL_LAYOUT_NC, CNNL_LAYOUT_NLC, CNNL_LAYOUT_NHWC, CNNL_LAYOUT_NDHWC}; + +template +class SyncBatchNormMLUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + float epsilon = ctx.Attr("epsilon"); + float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const bool trainable_stats = ctx.Attr("trainable_statistics"); + const std::string layout_str = ctx.Attr("data_layout"); + const DataLayout layout = framework::StringToDataLayout(layout_str); + + PADDLE_ENFORCE_EQ(use_global_stats, + false, + platform::errors::InvalidArgument( + "sync_batch_norm doesn't support " + "to set use_global_stats True. 
Please use batch_norm " + "in this case.")); + + const auto *x = ctx.Input("X"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *mean = ctx.Input("Mean"); + const auto *variance = ctx.Input("Variance"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + auto *y = ctx.Output("Y"); + + const auto &x_dims = x->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + platform::errors::InvalidArgument( + "The Input dim size should be larger than 1.")); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + platform::errors::InvalidArgument( + "The Input dim size should be less than 6.")); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + Tensor trans_x; + Tensor trans_y; + std::vector forward_perm; + std::vector backward_perm; + std::vector trans_shape; + const bool need_transpose = + ((layout == DataLayout::kNCHW && x_dims.size() != 2) || + x_dims.size() == 5); + if (need_transpose) { + SetMLUTransposePerm( + x_dims, layout, &forward_perm, &backward_perm, &trans_shape); + trans_x.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + trans_y.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + MLUCnnlTensorDesc desc_x(*x); + MLUCnnlTensorDesc desc_trans_x( + trans_shape.size(), trans_shape.data(), ToCnnlDataType(x->dtype())); + MLUCnnl::Transpose(ctx, + forward_perm, + x_dims.size(), + desc_x.get(), + GetBasePtr(x), + desc_trans_x.get(), + GetBasePtr(&trans_x)); + } else { + trans_x = *x; + trans_y = *y; + } + + MLUCnnlTensorDesc desc_trans( + trans_x, + supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET], + ToCnnlDataType()); + + bool test_mode = is_test && (!trainable_stats); + if (test_mode) { // inference + MLUCnnlTensorDesc desc_weight_bias_mean_var(*bias); + MLUCnnl::FusedBatchNorm(ctx, + false /*is_training*/, + desc_trans.get(), + GetBasePtr(&trans_x), + desc_weight_bias_mean_var.get(), + GetBasePtr(scale), + GetBasePtr(bias), + GetBasePtr(mean), + GetBasePtr(variance), + epsilon, + momentum, + desc_trans.get(), + GetBasePtr(&trans_y), + nullptr, + nullptr, + nullptr, + nullptr); + } else { // training + if (ctx.HasInput("MomentumTensor")) { + const auto *mom_tensor = ctx.Input("MomentumTensor"); + Tensor mom_cpu; + paddle::framework::TensorCopySync( + *mom_tensor, platform::CPUPlace(), &mom_cpu); + momentum = mom_cpu.data()[0]; + } + + Tensor local_mean, local_var; + local_mean.mutable_data(mean->dims(), ctx.GetPlace()); + local_var.mutable_data(variance->dims(), ctx.GetPlace()); + MLUCnnlTensorDesc desc_mean_var(*mean_out); + + // cacl local_mean and local_var + MLUCnnl::SyncBatchNormStats(ctx, + desc_trans.get(), + GetBasePtr(&trans_x), + epsilon, + desc_mean_var.get(), + GetBasePtr(&local_mean), + desc_mean_var.get(), + GetBasePtr(&local_var)); + + Tensor input_count; + input_count.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); + FillMLUTensorWithHostValue( + ctx, static_cast(x->numel() / C), &input_count); + + Tensor count_all; + Tensor mean_all(mean->dtype()); + Tensor invstd_all(variance->dtype()); + + auto &dev_ctx = + ctx.template device_context(); + auto stream = dev_ctx.stream(); + auto *comm = dev_ctx.cncl_comm(); + if (comm) { + auto *comm 
= paddle::platform::CNCLCommContext::Instance() + .Get(0, ctx.GetPlace()) + ->comm(); + int count; + PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCommCount(&count, comm)); + count_all.mutable_data(phi::make_ddim({count}), ctx.GetPlace()); + cnclDataType_t dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(count_all.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&input_count), + GetBasePtr(&count_all), + 1, + dtype, + comm, + stream)); + + mean_all.mutable_data(phi::make_ddim({count, mean->numel()}), + ctx.GetPlace()); + invstd_all.mutable_data( + phi::make_ddim({count, variance->numel()}), ctx.GetPlace()); + + auto cncl_dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(mean_all.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&local_mean), + GetBasePtr(&mean_all), + local_mean.numel(), + cncl_dtype, + comm, + stream)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(GetBasePtr(&local_var), + GetBasePtr(&invstd_all), + local_var.numel(), + cncl_dtype, + comm, + stream)); + + } else { + count_all = input_count; + mean_all.ShareDataWith(local_mean); + invstd_all.ShareDataWith(local_var); + mean_all.Resize(phi::make_ddim({1, local_mean.numel()})); + invstd_all.Resize(phi::make_ddim({1, local_var.numel()})); + } + + MLUCnnlTensorDesc desc_all_mean_invstd( + invstd_all, CNNL_LAYOUT_NC, ToCnnlDataType()); + MLUCnnlTensorDesc desc_moving_mean_var(*mean_out); + MLUCnnlTensorDesc desc_saved_mean_var(*saved_mean); + MLUCnnlTensorDesc desc_count_all(count_all); + + MLUCnnl::SyncBatchNormGatherStatsWithCounts(ctx, + momentum, + epsilon, + desc_all_mean_invstd.get(), + GetBasePtr(&mean_all), + desc_all_mean_invstd.get(), + GetBasePtr(&invstd_all), + desc_moving_mean_var.get(), + GetBasePtr(mean_out), + desc_moving_mean_var.get(), + GetBasePtr(variance_out), + desc_count_all.get(), + GetBasePtr(&count_all), + desc_saved_mean_var.get(), + GetBasePtr(saved_mean), + desc_saved_mean_var.get(), + GetBasePtr(saved_variance)); + + MLUCnnlTensorDesc desc_other_param(*saved_mean); + MLUCnnl::SyncBatchNormElemt(ctx, + desc_trans.get(), + GetBasePtr(&trans_x), + desc_other_param.get(), + GetBasePtr(saved_mean), + desc_other_param.get(), + GetBasePtr(saved_variance), + desc_other_param.get(), + GetBasePtr(scale), + desc_other_param.get(), + GetBasePtr(bias), + desc_trans.get(), + GetBasePtr(&trans_y)); + } + if (need_transpose) { + MLUCnnlTensorDesc desc_y(*y); + MLUCnnlTensorDesc desc_trans_y(trans_y); + MLUCnnl::Transpose(ctx, + backward_perm, + trans_y.dims().size(), + desc_trans_y.get(), + GetBasePtr(&trans_y), + desc_y.get(), + GetBasePtr(y)); + } + } +}; + +template +class SyncBatchNormMLUGradKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const std::string layout_str = ctx.Attr("data_layout"); + const DataLayout layout = framework::StringToDataLayout(layout_str); + + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_inv_var = ctx.Input("SavedVariance"); + + const Tensor *x; + if (ctx.HasInput("Y")) { + PADDLE_ENFORCE_EQ(true, + false, + platform::errors::InvalidArgument( + 
"sync_batch_norm_grad doesn't support input Y")); + } else { + x = ctx.Input("X"); + } + + const auto &x_dims = x->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + platform::errors::InvalidArgument( + "The Input X dim size should be larger than 1.")); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + platform::errors::InvalidArgument( + "The Input X dim size should be less than 6.")); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + PADDLE_ENFORCE_EQ(scale->dims()[0], + C, + platform::errors::InvalidArgument( + "Expected first dim for input parameter(scale) of " + "OP(sync_batch_norm) be (%d), but given (%d).", + C, + scale->dims()[0])); + + d_x->mutable_data(ctx.GetPlace()); + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + } + PADDLE_ENFORCE_EQ(scale->dims().size(), + 1UL, + platform::errors::InvalidArgument( + "Expected rank for input parameter(scale) of " + "OP(sync_batch_norm) be (1), but given (%d).", + scale->dims().size())); + + Tensor trans_x; + Tensor trans_dy; + Tensor trans_dx; + std::vector forward_perm; + std::vector backward_perm; + std::vector trans_shape; + const bool need_transpose = + ((layout == DataLayout::kNCHW && x_dims.size() != 2) || + x_dims.size() == 5); + if (need_transpose) { + SetMLUTransposePerm( + x_dims, layout, &forward_perm, &backward_perm, &trans_shape); + trans_x.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + trans_dy.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + trans_dx.mutable_data(phi::make_ddim(trans_shape), ctx.GetPlace()); + MLUCnnlTensorDesc desc_x(*x); + MLUCnnlTensorDesc desc_trans_x( + trans_shape.size(), trans_shape.data(), ToCnnlDataType(x->dtype())); + MLUCnnl::Transpose(ctx, + forward_perm, + x_dims.size(), + desc_x.get(), + GetBasePtr(x), + desc_trans_x.get(), + GetBasePtr(&trans_x)); + MLUCnnl::Transpose(ctx, + forward_perm, + x_dims.size(), + desc_x.get(), + GetBasePtr(d_y), + desc_trans_x.get(), + GetBasePtr(&trans_dy)); + } else { + trans_x = *x; + trans_dy = *d_y; + trans_dx = *d_x; + } + MLUCnnlTensorDesc desc_trans( + trans_x, + supported_input_layout[x_dims.size() - GET_LAYOUT_OFFSET], + ToCnnlDataType()); + + Tensor sum_dy, sum_dy_xmu; + sum_dy.mutable_data(bias->dims(), ctx.GetPlace()); + sum_dy_xmu.mutable_data(bias->dims(), ctx.GetPlace()); + MLUCnnlTensorDesc desc_other_param(*bias); + + MLUCnnl::SyncBatchnormBackwardReduce( + ctx, + desc_trans.get(), + GetBasePtr(&trans_dy), + desc_trans.get(), + GetBasePtr(&trans_x), + desc_other_param.get(), + GetBasePtr(saved_mean), + desc_other_param.get(), + GetBasePtr(saved_inv_var), + d_scale ? desc_other_param.get() : nullptr, + d_scale ? GetBasePtr(d_scale) : nullptr, + d_bias ? desc_other_param.get() : nullptr, + d_bias ? GetBasePtr(d_bias) : nullptr, + desc_other_param.get(), + GetBasePtr(&sum_dy), + desc_other_param.get(), + GetBasePtr(&sum_dy_xmu), + true /*compute sum_dy, sum_dy_xmu*/, + d_scale ? true : false /*compute d_scale*/, + d_bias ? 
true : false /*compute d_bias*/); + + Tensor numel_count; + numel_count.mutable_data(phi::make_ddim({1}), ctx.GetPlace()); + FillMLUTensorWithHostValue( + ctx, static_cast(x->numel() / C), &numel_count); + + auto &dev_ctx = + ctx.template device_context(); + auto stream = dev_ctx.stream(); + auto *comm = dev_ctx.cncl_comm(); + if (comm) { + auto *comm = paddle::platform::CNCLCommContext::Instance() + .Get(0, ctx.GetPlace()) + ->comm(); + cnclDataType_t dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(numel_count.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&numel_count), + GetBasePtr(&numel_count), + 1, + dtype, + cnclSum, + comm, + stream)); + + auto cncl_dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(sum_dy.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&sum_dy), + GetBasePtr(&sum_dy), + sum_dy.numel(), + cncl_dtype, + cnclSum, + comm, + stream)); + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(GetBasePtr(&sum_dy_xmu), + GetBasePtr(&sum_dy_xmu), + sum_dy_xmu.numel(), + cncl_dtype, + cnclSum, + comm, + stream)); + } + + if (d_x) { + MLUCnnlTensorDesc desc_count(numel_count); + MLUCnnl::SyncBatchNormBackwardElemt(ctx, + desc_trans.get(), + GetBasePtr(&trans_dy), + desc_trans.get(), + GetBasePtr(&trans_x), + desc_other_param.get(), + GetBasePtr(saved_mean), + desc_other_param.get(), + GetBasePtr(saved_inv_var), + desc_other_param.get(), + GetBasePtr(scale), + desc_other_param.get(), + GetBasePtr(&sum_dy), + desc_other_param.get(), + GetBasePtr(&sum_dy_xmu), + desc_count.get(), + GetBasePtr(&numel_count), + desc_trans.get(), + GetBasePtr(&trans_dx)); + + if (need_transpose) { + MLUCnnlTensorDesc desc_dx(*d_x); + MLUCnnlTensorDesc desc_trans_dx(trans_dx); + MLUCnnl::Transpose(ctx, + backward_perm, + trans_dx.dims().size(), + desc_trans_dx.get(), + GetBasePtr(&trans_dx), + desc_dx.get(), + GetBasePtr(d_x)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(sync_batch_norm, + ops::SyncBatchNormMLUKernel, + ops::SyncBatchNormMLUKernel); + +REGISTER_OP_MLU_KERNEL(sync_batch_norm_grad, + ops::SyncBatchNormMLUGradKernel, + ops::SyncBatchNormMLUGradKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index cac8e95521d31..385879c08a72f 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -50,5 +50,7 @@ if(WITH_MLU) set_tests_properties(test_collective_allgather_api_mlu PROPERTIES TIMEOUT 120) set_tests_properties(test_c_comm_init_op_mlu PROPERTIES TIMEOUT 120) + set_tests_properties(test_sync_batch_norm_op_mlu_baseline PROPERTIES TIMEOUT + 120) endif() endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py new file mode 100644 index 0000000000000..4f80523a18254 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys + +sys.path.append("..") +import signal +import time +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_sync_batch_norm_base_mlu import TestSyncBatchNormRunnerBase, runtime_main +from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator + +from paddle.fluid.tests.unittests.test_sync_batch_norm_op import create_or_get_tensor + +_set_use_system_allocator(False) +paddle.enable_static() + + +class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase): + + def __init__(self): + self.global_ring_id = 0 + + self.dtype = np.float32 + self.N = 8 + self.C = 16 + self.H = 32 + self.W = 32 + self.dshape = [self.N, self.C, self.H, self.W] + self.atol = 1e-3 + + def get_model(self, + main, + startup, + place, + layout, + seed, + sync_bn=False, + only_forward=False): + """Build program.""" + use_cudnn = False + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + data = fluid.layers.data(name='input', + shape=self.dshape, + dtype=self.dtype, + append_batch_size=False) + conv = fluid.layers.conv2d( + input=data, + num_filters=32, + filter_size=1, + param_attr=fluid.ParamAttr(name='conv2d_weight'), + bias_attr=False, + use_cudnn=use_cudnn) + bn = fluid.layers.batch_norm( + conv, + param_attr=fluid.ParamAttr(name='bn_scale'), + bias_attr=fluid.ParamAttr(name='bn_bias'), + moving_mean_name='bn_moving_mean', + moving_variance_name='bn_moving_variance', + data_layout=layout, + is_test=only_forward) + # if self.dtype == np.float16: + # bn = fluid.layers.cast(bn, 'float32') + sigmoid = fluid.layers.sigmoid(bn) + out = fluid.layers.reduce_sum(sigmoid) + # if not sync_bn: + # out = out / core.get_mlu_device_count() + if not only_forward: + sgd_opt = fluid.optimizer.SGD(learning_rate=0.0) + sgd_opt.backward(out) + return [out, conv, bn] + + +if __name__ == "__main__": + # print('sync_batch_norm_op_mlu.py __main__') + + runtime_main(TestSyncBatchNormOpTraining, "identity", 0) From 51e2933dea080c390affe13e6f358eaa92caca73 Mon Sep 17 00:00:00 2001 From: qipengh Date: Tue, 12 Jul 2022 14:36:11 +0800 Subject: [PATCH 140/250] [MLU]add sync_batch_norm op 2/2 (#44178) --- paddle/fluid/operators/scale_op_mlu.cc | 2 +- .../platform/device/mlu/device_context.cc | 23 +- .../mlu/test_sync_batch_norm_base_mlu.py | 506 ++++++++++++++++++ .../test_sync_batch_norm_op_mlu_baseline.py | 43 ++ .../mlu/test_sync_batch_norm_op_mlu_extra.py | 177 ++++++ 5 files changed, 737 insertions(+), 14 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py create mode 100644 
python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py diff --git a/paddle/fluid/operators/scale_op_mlu.cc b/paddle/fluid/operators/scale_op_mlu.cc index 7acaad8ddaad6..363c3e98a6dfc 100644 --- a/paddle/fluid/operators/scale_op_mlu.cc +++ b/paddle/fluid/operators/scale_op_mlu.cc @@ -21,7 +21,7 @@ namespace operators { template class ScaleMLUKernel : public framework::OpKernel { public: - virtual void Compute(const framework::ExecutionContext& ctx) const { + void Compute(const framework::ExecutionContext& ctx) const { auto& dev_ctx = GetDevCtxFromCTX(ctx); auto* in_var = ctx.InputVar("X"); auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); diff --git a/paddle/fluid/platform/device/mlu/device_context.cc b/paddle/fluid/platform/device/mlu/device_context.cc index c3c5546a12a2e..087b4803320e5 100644 --- a/paddle/fluid/platform/device/mlu/device_context.cc +++ b/paddle/fluid/platform/device/mlu/device_context.cc @@ -42,19 +42,16 @@ MLUDeviceContext::MLUDeviceContext(MLUPlace place) : place_(place) { runtime_version_ = GetMLURuntimeVersion(place_.device); cnnl_version_ = GetMLUCnnlVersion(place_.device); - LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device - << ", MLU Compute Capability: " - << compute_capability_ / 10 << "." - << compute_capability_ % 10 - << ", Driver API Version: " << driver_version_ / 10000 - << "." << (driver_version_ / 100) % 100 << "." - << driver_version_ % 100 << ", Runtime API Version: " - << runtime_version_ / 10000 << "." - << (runtime_version_ / 100) % 100 << "." - << runtime_version_ % 100 - << ", Cnnl API Version: " << cnnl_version_ / 10000 - << "." << (cnnl_version_ / 100) % 100 << "." - << cnnl_version_ % 100; + LOG_FIRST_N(WARNING, 1) + << "Please NOTE: device: " << static_cast(place_.device) + << ", MLU Compute Capability: " << compute_capability_ / 10 << "." + << compute_capability_ % 10 + << ", Driver API Version: " << driver_version_ / 10000 << "." + << (driver_version_ / 100) % 100 << "." << driver_version_ % 100 + << ", Runtime API Version: " << runtime_version_ / 10000 << "." + << (runtime_version_ / 100) % 100 << "." << runtime_version_ % 100 + << ", Cnnl API Version: " << cnnl_version_ / 10000 << "." + << (cnnl_version_ / 100) % 100 << "." << cnnl_version_ % 100; default_ctx_.reset(new MLUContext(place_)); } diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py new file mode 100644 index 0000000000000..3081ee9d38754 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py @@ -0,0 +1,506 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import numpy as np +import unittest +import time +import argparse +import os +import six +import sys + +sys.path.append("..") +import subprocess +import traceback +import functools +import pickle +from contextlib import closing +import paddle.fluid as fluid +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +from six import string_types +import paddle + +from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator + +from paddle.fluid.tests.unittests.test_sync_batch_norm_op import create_or_get_tensor + +_set_use_system_allocator(False) +paddle.enable_static() + +SEED = 10 + + +class TestSyncBatchNormRunnerBase(object): + + def get_model(self, + main, + startup, + place, + layout, + seed, + sync_bn=False, + only_forward=False): + raise NotImplementedError( + "get model should be implemented by child class.") + + def wait_server_ready(self, endpoints): + assert not isinstance(endpoints, string_types) + while True: + all_ok = True + not_ready_endpoints = [] + for ep in endpoints: + ip_port = ep.split(":") + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as sock: + sock.settimeout(2) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + if hasattr(socket, 'SO_REUSEPORT'): + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, + 1) + + result = sock.connect_ex((ip_port[0], int(ip_port[1]))) + if result != 0: + all_ok = False + not_ready_endpoints.append(ep) + if not all_ok: + sys.stderr.write("server not ready, wait 3 sec to retry...\n") + sys.stderr.write("not ready endpoints:" + + str(not_ready_endpoints) + "\n") + sys.stderr.flush() + time.sleep(3) + else: + break + + def initCommunicator(self, program, rank, nranks, wait_port, + current_endpoint, endpoints): + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + if rank == 0 and wait_port: + self.wait_server_ready(other_endpoints) + block = program.global_block() + cncl_id_var = block.create_var(name=nameGen.generate('cncl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + block.append_op(type='c_gen_cncl_id', + inputs={}, + outputs={'Out': cncl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints + }) + block.append_op(type='c_comm_init', + inputs={'X': cncl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': self.global_ring_id + }) + + def run_trainer(self, args): + device_id = int(os.getenv("FLAGS_selected_mlus", "0")) + place = fluid.MLUPlace(device_id) + places = [place] + + # Test training + for place in places: + for layout in ["NCHW", "NHWC"]: + self._compare(args, place, layout, False) + + # Test inference + for place in places: + for layout in ["NCHW", "NHWC"]: + self._compare(args, place, layout, True) + + # # Test FP16 - @TODO + # self.dtype = np.float16 + # self.atol = 1e-2 + + # # Test training + # for place in places: + # for layout in ["NCHW", "NHWC"]: + # self._compare(args, place, layout, False) + + # # Test inference + # for place in places: + # for layout in ["NCHW", "NHWC"]: + # self._compare(args, place, layout, True) + + sys.stdout.buffer.write( + pickle.dumps( + 'training, inference, fp32, fp16, NCHW, NHWC all passed')) + + def _compare(self, args, place, layout, only_forward): + scope = core.Scope() + + np.random.seed(SEED) + data = np.random.random(size=self.dshape).astype(self.dtype) * 4. 
- 2 + sys.stderr.write("data: " + str(data) + "\n") + data = create_or_get_tensor(scope, "input", + OpTest.np_dtype_to_fluid_dtype(data), place) + + bn_fetches = self._cal_single_card(args, data, place, layout, + only_forward) + fetch_names, sync_bn_fetches = self._cal_multiple_cards( + args, data, place, layout, only_forward) + + sys.stderr.write("len(sync_bn_fetches): " + str(len(sync_bn_fetches)) + + "\n") + for i in six.moves.xrange(0, len(sync_bn_fetches)): + sys.stderr.write("i: " + str(i) + "\n") + sys.stderr.write("fetch_names[i]): " + fetch_names[i] + "\n") + + bn_val = bn_fetches[i] + sync_bn_val = sync_bn_fetches[i] + if sync_bn_val.shape != bn_val.shape: + sync_bn_val = sync_bn_val[:bn_val.shape[0]] + + # i = 0 + if fetch_names[i] == 'reduce_sum_0.tmp_0': + # sys.stderr.write("skip reduce_sum_0.tmp_0 (Out of reduce_sum op)" + "\n") + sys.stderr.write("reduce_sum_0.tmp_0 (Out of reduce_sum op)" + + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 1 + if fetch_names[i] == 'conv2d_0.tmp_0': + # sys.stderr.write("skip conv2d_0.tmp_0 (X)" + "\n") + sys.stderr.write("conv2d_0.tmp_0 (X)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 2 + if fetch_names[i] == 'batch_norm_0.tmp_3': + # sys.stderr.write("skip batch_norm_0.tmp_3 (Y)" + "\n") + sys.stderr.write("batch_norm_0.tmp_3 (Y)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 2 + if fetch_names[i] == 'batch_norm_0.tmp_2': + # sys.stderr.write("skip batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n") + sys.stderr.write( + "batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 3 + if fetch_names[i] == 'bn_moving_mean': + sys.stderr.write("skip bn_moving_mean (MeanOut)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + continue + + # i = 4 + if fetch_names[i] == 'bn_moving_variance': + sys.stderr.write("skip bn_moving_variance (VarianceOut)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + continue + + # i = 7 + if fetch_names[i] == 'batch_norm_0.tmp_0': + # sys.stderr.write("skip batch_norm_0.tmp_0 (SavedMean)" + "\n") + sys.stderr.write("batch_norm_0.tmp_0 (SavedMean)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 8 + if fetch_names[i] == 'batch_norm_0.tmp_1': + sys.stderr.write("skip batch_norm_0.tmp_1 (SavedVariance)" + + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + continue + + # i = 9 + if fetch_names[i] == 'bn_scale@GRAD': + # sys.stderr.write("skip bn_scale@GRAD (Scale@GRAD)" + "\n") + sys.stderr.write("bn_scale@GRAD (Scale@GRAD)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 10 + if fetch_names[i] == 'bn_bias@GRAD': + # sys.stderr.write("skip bn_bias@GRAD (Bias@GRAD)" + "\n") + sys.stderr.write("bn_bias@GRAD (Bias@GRAD)" + "\n") + sys.stderr.write("bn_val: " + 
str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 11 + if fetch_names[i] == 'batch_norm_0.tmp_3@GRAD': + # sys.stderr.write("skip batch_norm_0.tmp_3@GRAD (Y@GRAD)" + "\n") + sys.stderr.write("batch_norm_0.tmp_3@GRAD (Y@GRAD)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + # i = 12 + if fetch_names[i] == 'conv2d_0.tmp_0@GRAD': + # sys.stderr.write("skip conv2d_0.tmp_0@GRAD (X@GRAD)" + "\n") + sys.stderr.write("conv2d_0.tmp_0@GRAD (X@GRAD)" + "\n") + sys.stderr.write("bn_val: " + str(bn_val) + "\n") + sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") + + # continue + + atol = self.atol + if fetch_names[i] == 'conv2d_0.tmp_0@GRAD': + atol = 1e-2 + + assert np.allclose( + bn_val, sync_bn_val, atol=atol), "Output (" + fetch_names[ + i] + ") has diff. \n" + "\nBN " + str( + bn_val) + "\n" + "Sync BN " + str(sync_bn_val) + + def _cal_single_card(self, args, data, place, layout, only_forward): + # Single-MLU, N = 32 per MLU + train_prog = fluid.Program() + startup_prog = fluid.Program() + train_prog.global_seed(SEED) + startup_prog.global_seed(SEED) + paddle.seed(SEED) + + outs = self.get_model(train_prog, startup_prog, place, layout, SEED, + False, only_forward) + + exe = fluid.Executor(place) + exe.run(startup_prog) + fetch_names = [v.name for v in outs] + [ + 'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias' + ] + if not only_forward: + others = [ + 'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD', + 'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD' + ] + fetch_names += others + bn_fetches = exe.run(program=train_prog, + feed={'input': data}, + fetch_list=fetch_names) + + return bn_fetches + + def _cal_multiple_cards(self, args, data, place, layout, only_forward): + # Multi-MLUs, self.N per MLU + assert core.get_mlu_device_count() > 1 + + train_prog = fluid.Program() + startup_prog = fluid.Program() + train_prog.global_seed(SEED) + startup_prog.global_seed(SEED) + paddle.seed(SEED) + sys.stderr.write("train_prog: " + train_prog.to_string(True) + "\n") + sys.stderr.write("startup_prog: " + startup_prog.to_string(True) + "\n") + + endpoints = args["endpoints"].split(",") + rank = args["trainerid"] + current_endpoint = args["currentendpoint"] + nranks = 2 + + self.initCommunicator(startup_prog, rank, nranks, True, + current_endpoint, endpoints) + sys.stderr.write("after init, startup_prog: " + + startup_prog.to_string(True) + "\n") + train_prog.global_seed(SEED) + train_prog._sync_with_cpp() + startup_prog.global_seed(SEED) + startup_prog._sync_with_cpp() + paddle.seed(SEED) + + self.rank = rank + outs = self.get_model(train_prog, startup_prog, place, layout, SEED, + True, only_forward) + sys.stderr.write("after get_model, train_prog: " + + train_prog.to_string(True) + "\n") + sys.stderr.write("after get_model, startup_prog: " + + startup_prog.to_string(True) + "\n") + + ops = train_prog.blocks[0].ops + for i, op in enumerate(ops): + if op.type == 'batch_norm': + sys.stderr.write("i: " + str(i) + "\n") + sys.stderr.write("op type: " + op.type + "\n") + op.desc.set_type('sync_batch_norm') + if op.type == 'batch_norm_grad': + sys.stderr.write("i: " + str(i) + "\n") + sys.stderr.write("op type: " + op.type + "\n") + op.desc.set_type('sync_batch_norm_grad') + + sys.stderr.write("after update sync_batch_norm, train_prog: " + + train_prog.to_string(True) + "\n") + + exe = fluid.Executor(place) + 
exe.run(startup_prog) + fetch_names = [v.name for v in outs] + [ + 'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias' + ] + if not only_forward: + others = [ + 'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD', + 'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD' + ] + fetch_names += others + sync_bn_fetches = exe.run(program=train_prog, + feed={'input': data}, + fetch_list=fetch_names) + + return fetch_names, sync_bn_fetches + + +def runtime_main(test_class, col_type, sub_type): + args = {} + model = test_class() + args["deviceid"] = os.getenv("FLAGS_selected_mlus") + args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID")) + args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM")) + args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS') + args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT") + args["col_type"] = col_type + model.run_trainer(args) + + +import paddle.compat as cpt +import socket +from contextlib import closing + + +class TestDistBase(unittest.TestCase): + + def setUp(self): + self._port_set = set() + self._trainers = 2 + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + self._python_interp = sys.executable + + def _find_free_port(self): + + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port + + def _run_cluster(self, model_file, envs): + worker_endpoints = self._ps_endpoints.split(",") + w0_ep, w1_ep = worker_endpoints + # print("w0_ep:", w0_ep, " w1_ep:", w1_ep) + env0 = { + "FLAGS_selected_mlus": "0", + "PADDLE_TRAINER_ID": "0", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w0_ep, + } + + env1 = { + "FLAGS_selected_mlus": "1", + "PADDLE_TRAINER_ID": "1", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w1_ep, + } + #update environment + env0.update(envs) + env1.update(envs) + + tr_cmd = "%s %s" + tr0_cmd = tr_cmd % (self._python_interp, model_file) + tr1_cmd = tr_cmd % (self._python_interp, model_file) + tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w") + tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w") + print("tr0_cmd: {}, env: {}\n".format(tr0_cmd, env0)) + print("tr1_cmd: {}, env: {}\n".format(tr1_cmd, env1)) + tr0_proc = subprocess.Popen(tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0) + + tr1_proc = subprocess.Popen(tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1) + + tr0_out, tr0_err = tr0_proc.communicate() + tr1_out, tr1_err = tr1_proc.communicate() + + sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + # close trainer file + tr0_pipe.close() + tr1_pipe.close() + with open("/tmp/tr0_err_%d.log" % os.getpid(), "r") as f: + sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) + with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f: + sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) + return pickle.loads(tr0_out), pickle.loads( + tr1_out), tr0_proc.pid, tr1_proc.pid + + def check_with_place(self, + model_file, + col_type, + check_error_log=False, + need_envs={}): + required_envs = { + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_eager_delete_tensor_gb": "0.0", + "PATH": 
os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "LD_PRELOAD": os.getenv("LD_PRELOAD", ""), + "FLAGS_call_stack_level": "2", + "GLOG_v": "3", + "PADDLE_WITH_GLOO": '0', + "BACKEND": "cncl" + } + required_envs.update(need_envs) + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + required_envs["GLOO_LOG_LEVEL"] = "TRACE" + tr0_out, tr1_out, pid0, pid1 = self._run_cluster( + model_file, required_envs) + self.assertEqual( + tr0_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed') + self.assertEqual( + tr1_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed') diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py new file mode 100644 index 0000000000000..ac3f686cb8fe2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle +import os +import sys + +sys.path.append("..") +from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator + +from test_sync_batch_norm_base_mlu import TestDistBase + +_set_use_system_allocator(False) +paddle.enable_static() + + +class TestSyncBatchNormOp(TestDistBase): + + def _setup_config(self): + pass + + def test_identity(self, col_type="identity"): + self.check_with_place("sync_batch_norm_op_mlu.py", + col_type, + check_error_log=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py new file mode 100644 index 0000000000000..955d9a122a292 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_extra.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +test for sync bachnorm op. +for both FP32 and FP16 input. 
+""" + +from __future__ import print_function + +import unittest +import numpy as np +import os +import sys +import six +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.nn as nn +from paddle.fluid import Program, program_guard + +from paddle.fluid.tests.unittests.op_test import OpTest, _set_use_system_allocator +from paddle.fluid.tests.unittests.test_dist_base import TestDistBase + +paddle.enable_static() + + +class TestDygraphSyncBatchNormAPIError(unittest.TestCase): + + def test_errors(self): + if not core.is_compiled_with_mlu(): + return + + with program_guard(Program(), Program()): + my_sync_batch_norm = paddle.nn.SyncBatchNorm(10) + x1 = fluid.create_lod_tensor(np.array([-1, 3, 5, 5]), + [[1, 1, 1, 1]], fluid.MLUPlace(0)) + self.assertRaises(TypeError, my_sync_batch_norm, x1) + + # the input dtype of SyncBatchNorm must be float16 or float32 + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="int32") + self.assertRaises(TypeError, my_sync_batch_norm, x2) + + +class TestConvertSyncBatchNorm(unittest.TestCase): + + def test_convert(self): + if not core.is_compiled_with_mlu(): + return + + with program_guard(Program(), Program()): + compare_model = paddle.nn.Sequential(paddle.nn.Conv2D(3, 5, 3), + paddle.nn.BatchNorm2D(5), + paddle.nn.BatchNorm2D(5)) + model = paddle.nn.Sequential( + paddle.nn.Conv2D(3, 5, 3), paddle.nn.BatchNorm2D(5), + paddle.nn.BatchNorm2D( + 5, + weight_attr=fluid.ParamAttr(name='bn.scale'), + bias_attr=fluid.ParamAttr(name='bn.bias'))) + model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + for idx, sublayer in enumerate(compare_model.sublayers()): + if isinstance(sublayer, paddle.nn.BatchNorm2D): + self.assertEqual( + isinstance(model[idx], paddle.nn.SyncBatchNorm), True) + + +class TestConvertSyncBatchNormCast1(unittest.TestCase): + + def test_convert(self): + if not core.is_compiled_with_mlu(): + return + + class Net(nn.Layer): + + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2D(3, 5, 3) + self.bn = [] + bn = self.add_sublayer('bn', nn.BatchNorm2D(5)) + self.bn.append(bn) + + def forward(self, x): + x = self.conv1(x) + for bn in self.bn: + x = bn(x) + return x + + model = nn.Sequential() + model.add_sublayer('net1', Net()) + model.add_sublayer('net2', Net()) + compare_model = nn.Sequential() + compare_model.add_sublayer('net1', Net()) + compare_model.add_sublayer('net2', Net()) + model = nn.SyncBatchNorm.convert_sync_batchnorm(model) + self.assertEqual(len(compare_model.sublayers()), len(model.sublayers())) + + +class TestConvertSyncBatchNormCase2(unittest.TestCase): + + def test_convert(self): + if not core.is_compiled_with_mlu(): + return + + with fluid.dygraph.guard(fluid.MLUPlace(0)): + + class SyBNNet(paddle.nn.Layer): + + def __init__(self, in_ch=3, out_ch=3, dirate=1): + super(SyBNNet, self).__init__() + self.bn_s1 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + paddle.nn.BatchNorm3D( + out_ch, + weight_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(0.)))) + self.bn_s2 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + paddle.nn.BatchNorm3D(out_ch, data_format='NDHWC')) + + def forward(self, x): + x = self.bn_s1(x) + out = paddle.sum(paddle.abs(self.bn_s2(x))) + return out + + class BNNet(paddle.nn.Layer): + + def __init__(self, in_ch=3, out_ch=3, dirate=1): + super(BNNet, self).__init__() + self.bn_s1 = paddle.nn.BatchNorm3D( + out_ch, + weight_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(0.))) + self.bn_s2 = 
paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + paddle.nn.BatchNorm3D(out_ch, data_format='NDHWC')) + + def forward(self, x): + x = self.bn_s1(x) + out = paddle.sum(paddle.abs(self.bn_s2(x))) + return out + + bn_model = BNNet() + sybn_model = SyBNNet() + np.random.seed(10) + data = np.random.random([3, 3, 3, 3, 3]).astype('float32') + x = paddle.to_tensor(data) + bn_out = bn_model(x) + sybn_out = sybn_model(x) + self.assertTrue( + np.allclose(bn_out.numpy(), sybn_out.numpy()), + "Output has diff. \n" + "\nBN " + str(bn_out.numpy()) + + "\n" + "Sync BN " + str(sybn_out.numpy())) + + +class TestDygraphSyncBatchNormDataFormatError(unittest.TestCase): + + def test_errors(self): + if not core.is_compiled_with_mlu(): + return + + with fluid.dygraph.guard(fluid.MLUPlace(0)): + my_sync_batch_norm = paddle.nn.SyncBatchNorm(10, data_format='CN') + data = np.random.random([3, 3, 3]).astype('float32') + x = paddle.to_tensor(data) + self.assertRaises(ValueError, my_sync_batch_norm, x) + + +if __name__ == '__main__': + unittest.main() From 682acd2224c415afdd2d3353917ca4712963b7e8 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 12 Jul 2022 14:39:12 +0800 Subject: [PATCH 141/250] [Sparse]add sparse unary api(sin/tan/pow/neg/log1p/square/cast...) (#44022) --- paddle/fluid/pybind/eager_method.cc | 25 ++ .../api/yaml/generator/sparse_bw_api_gen.py | 1 + paddle/phi/api/yaml/sparse_api.yaml | 168 ++++++- paddle/phi/api/yaml/sparse_bw_api.yaml | 129 +++++- paddle/phi/kernels/activation_grad_kernel.h | 8 + paddle/phi/kernels/activation_kernel.h | 27 +- .../phi/kernels/funcs/eigen/eigen_function.h | 12 + paddle/phi/kernels/funcs/eigen/elementwise.cc | 17 + paddle/phi/kernels/funcs/eigen/elementwise.cu | 17 + .../kernels/sparse/cpu/unary_grad_kernel.cc | 79 ++++ paddle/phi/kernels/sparse/cpu/unary_kernel.cc | 139 ++++++ .../kernels/sparse/gpu/unary_grad_kernel.cu | 79 ++++ paddle/phi/kernels/sparse/gpu/unary_kernel.cu | 142 ++++++ .../sparse/impl/unary_grad_kernel_impl.h | 141 ++++++ .../kernels/sparse/impl/unary_kernel_impl.h | 207 +++++++++ .../phi/kernels/sparse/unary_grad_kernel.cc | 183 -------- paddle/phi/kernels/sparse/unary_grad_kernel.h | 68 ++- paddle/phi/kernels/sparse/unary_kernel.cc | 177 -------- paddle/phi/kernels/sparse/unary_kernel.h | 95 +++- .../kernels/test_sparse_activation_dev_api.cc | 4 +- .../unittests/test_sparse_elementwise_op.py | 10 +- .../tests/unittests/test_sparse_model.py | 4 + .../tests/unittests/test_sparse_unary_op.py | 237 +++++----- .../tests/unittests/test_sparse_utils_op.py | 4 +- python/paddle/incubate/sparse/__init__.py | 37 +- python/paddle/incubate/sparse/binary.py | 199 ++++++++- python/paddle/incubate/sparse/math.py | 260 ----------- python/paddle/incubate/sparse/unary.py | 413 ++++++++++++++++-- 28 files changed, 2036 insertions(+), 846 deletions(-) create mode 100644 paddle/phi/kernels/sparse/cpu/unary_grad_kernel.cc create mode 100644 paddle/phi/kernels/sparse/cpu/unary_kernel.cc create mode 100644 paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu create mode 100644 paddle/phi/kernels/sparse/gpu/unary_kernel.cu create mode 100644 paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/sparse/impl/unary_kernel_impl.h delete mode 100644 paddle/phi/kernels/sparse/unary_grad_kernel.cc delete mode 100644 paddle/phi/kernels/sparse/unary_kernel.cc delete mode 100644 python/paddle/incubate/sparse/math.py diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 
77e196291143c..086c15dafdf22 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1473,6 +1473,27 @@ static PyObject* tensor_method_get_map_tensor(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_get_non_zero_nums(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + PADDLE_ENFORCE( + self->tensor.is_sparse_coo_tensor() || + self->tensor.is_sparse_csr_tensor(), + paddle::platform::errors::Fatal("this method is only effective for " + "SparseCooTensor or SparseCsrTensor")); + if (self->tensor.is_sparse_coo_tensor()) { + auto sparse_coo_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + return ToPyObject(sparse_coo_tensor->nnz()); + } else { + auto sparse_csr_tensor = + std::dynamic_pointer_cast(self->tensor.impl()); + return ToPyObject(sparse_csr_tensor->nnz()); + } + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor_method_get_non_zero_indices(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -1962,6 +1983,10 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, /***the method of sparse tensor****/ + {"nnz", + (PyCFunction)(void (*)(void))tensor_method_get_non_zero_nums, + METH_VARARGS | METH_KEYWORDS, + NULL}, {"indices", (PyCFunction)(void (*)(void))tensor_method_get_non_zero_indices, METH_VARARGS | METH_KEYWORDS, diff --git a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py index e30c5e3c5d05c..f3172a23cb991 100644 --- a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py @@ -109,6 +109,7 @@ def source_include(header_file_path): #include "glog/logging.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/sparse_api_custom_impl.h" diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index 68c41d50ae5ff..d8c275ff1f2e6 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -1,12 +1,85 @@ +- api : abs + args : (Tensor x) + output : Tensor(out) + kernel : + func : abs_coo{sparse_coo -> sparse_coo}, + abs_csr{sparse_csr -> sparse_csr} + layout : x + backward : abs_grad + +- api : acos + args : (Tensor x) + output : Tensor(out) + kernel : + func : acos_coo{sparse_coo -> sparse_coo}, + acos_csr{sparse_csr -> sparse_csr} + layout : x + backward : acos_grad + +- api : acosh + args : (Tensor x) + output : Tensor(out) + kernel : + func : acosh_coo{sparse_coo -> sparse_coo}, + acosh_csr{sparse_csr -> sparse_csr} + layout : x + backward : acosh_grad + - api : add args : (Tensor x, Tensor y) output : Tensor(out) kernel : - func : add_coo_coo{sparse_coo -> sparse_coo}, - add_csr_csr{sparse_csr -> sparse_csr} + func : add_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, + add_csr_csr{sparse_csr, sparse_csr -> sparse_csr} layout : x backward : add_grad +- api : asin + args : (Tensor x) + output : Tensor(out) + kernel : + func : asin_coo{sparse_coo -> sparse_coo}, + asin_csr{sparse_csr -> sparse_csr} + layout : x + backward : asin_grad + +- api : asinh + args : (Tensor x) + output : Tensor(out) + kernel : + func : asinh_coo{sparse_coo -> sparse_coo}, + asinh_csr{sparse_csr -> sparse_csr} + layout : x + backward : asinh_grad + +- api : atan + args : (Tensor x) + output : Tensor(out) + kernel : + func : atan_coo{sparse_coo -> sparse_coo}, + atan_csr{sparse_csr 
-> sparse_csr} + layout : x + backward : atan_grad + +- api : atanh + args : (Tensor x) + output : Tensor(out) + kernel : + func : atanh_coo{sparse_coo -> sparse_coo}, + atanh_csr{sparse_csr -> sparse_csr} + layout : x + backward : atanh_grad + +- api : cast + args : (Tensor x, DataType index_dtype=DataType::UNDEFINED, DataType value_dtype=DataType::UNDEFINED) + output : Tensor(out) + kernel : + func : cast_coo{sparse_coo -> sparse_coo}, + cast_csr{sparse_csr -> sparse_csr} + layout : x + data_type : x + backward : cast_grad + - api : conv3d args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) output : Tensor(out), Tensor(rulebook) @@ -41,38 +114,81 @@ args : (Tensor x, Tensor y) output : Tensor(out) kernel : - func : divide_coo_coo{sparse_coo -> sparse_coo}, - divide_csr_csr{sparse_csr -> sparse_csr} + func : divide_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, + divide_csr_csr{sparse_csr, sparse_csr -> sparse_csr} layout : x backward : divide_grad +- api : divide_scalar + args : (Tensor x, float scalar) + output : Tensor(out) + kernel : + func : divide_coo_scalar{sparse_coo -> sparse_coo}, + divide_csr_scalar{sparse_csr -> sparse_csr} + backward : divide_scalar_grad + +- api : log1p + args : (Tensor x) + output : Tensor(out) + kernel : + func : log1p_coo{sparse_coo -> sparse_coo}, + log1p_csr{sparse_csr -> sparse_csr} + layout : x + backward : log1p_grad + - api : multiply args : (Tensor x, Tensor y) output : Tensor(out) kernel : - func : multiply_coo_coo{sparse_coo -> sparse_coo}, - multiply_csr_csr{sparse_csr -> sparse_csr} + func : multiply_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, + multiply_csr_csr{sparse_csr, sparse_csr -> sparse_csr} layout : x backward : multiply_grad +- api : pow + args : (Tensor x, float factor) + output : Tensor(out) + kernel : + func : pow_coo{sparse_coo -> sparse_coo}, + pow_csr{sparse_csr -> sparse_csr} + layout : x + backward : pow_grad + - api : relu args : (Tensor x) output : Tensor(out) kernel : - func : sparse_coo_relu{sparse_coo -> sparse_coo}, - sparse_csr_relu{sparse_csr -> sparse_csr} + func : relu_coo{sparse_coo -> sparse_coo}, + relu_csr{sparse_csr -> sparse_csr} layout : x backward : relu_grad +- api : scale + args : (Tensor x, float scale, float bias, bool bias_after_scale) + output : Tensor(out) + kernel : + func : scale_coo{sparse_coo -> sparse_coo}, + scale_csr{sparse_csr -> sparse_csr} + backward : scale_grad + - api : sin args : (Tensor x) - output : Tensor(out@SparseCooTensor) + output : Tensor(out) kernel : - func : sparse_coo_sin {sparse_coo -> sparse_coo}, - sparse_csr_sin {sparse_csr -> sparse_csr} + func : sin_coo{sparse_coo -> sparse_coo}, + sin_csr{sparse_csr -> sparse_csr} layout : x backward : sin_grad +- api : sinh + args : (Tensor x) + output : Tensor(out) + kernel : + func : sinh_coo{sparse_coo -> sparse_coo}, + sinh_csr{sparse_csr -> sparse_csr} + layout : x + backward : sinh_grad + - api : softmax args : (Tensor x, int axis=-1) output : Tensor(out) @@ -85,26 +201,44 @@ args : (Tensor x) output : Tensor(out) kernel : - func : sparse_coo_sqrt{sparse_coo -> sparse_coo}, - sparse_csr_sqrt{sparse_csr -> sparse_csr} + func : sqrt_coo{sparse_coo -> sparse_coo}, + sqrt_csr{sparse_csr -> sparse_csr} layout : x backward : sqrt_grad +- api : square + args : (Tensor x) + output : Tensor(out) + kernel : + func : square_coo{sparse_coo -> sparse_coo}, + square_csr{sparse_csr -> sparse_csr} + layout : x + backward : square_grad + - api : subtract args : (Tensor x, Tensor y) output : 
Tensor(out) kernel : - func : subtract_coo_coo{sparse_coo -> sparse_coo}, - subtract_csr_csr{sparse_csr -> sparse_csr} + func : subtract_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, + subtract_csr_csr{sparse_csr, sparse_csr -> sparse_csr} layout : x backward : subtract_grad +- api : tan + args : (Tensor x) + output : Tensor(out) + kernel : + func : tan_coo{sparse_coo -> sparse_coo}, + tan_csr{sparse_csr -> sparse_csr} + layout : x + backward : tan_grad + - api : tanh args : (Tensor x) output : Tensor(out) kernel : - func : sparse_coo_tanh{sparse_coo -> sparse_coo}, - sparse_csr_tanh{sparse_csr -> sparse_csr} + func : tanh_coo{sparse_coo -> sparse_coo}, + tanh_csr{sparse_csr -> sparse_csr} layout : x backward : tanh_grad diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 0ca9c9daa9a5a..220d45cadcb06 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -1,3 +1,27 @@ +- backward_api : abs_grad + forward : tanh(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : abs_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + abs_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : acos_grad + forward : acos(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : acos_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + acos_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : acosh_grad + forward : acosh(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : acosh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + acosh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + - backward_api : add_grad forward : add(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) @@ -6,6 +30,47 @@ func : add_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} +- backward_api : asin_grad + forward : asin(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : asin_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + asin_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : asinh_grad + forward : asinh(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : asinh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + asinh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : atan_grad + forward : atan(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : atan_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + atan_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : atanh_grad + forward : atanh(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : atanh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + atanh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : cast_grad + forward : cast(Tensor x, DataType index_dtype, DataType value_dtype) -> Tensor(out) + args : (Tensor x, Tensor out_grad, DataType value_dtype) + output : Tensor(x_grad) + kernel : + func : cast_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + cast_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + data_type : out_grad + - backward_api : conv3d_grad forward : conv3d (Tensor x, Tensor 
kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) @@ -41,6 +106,20 @@ func : divide_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, divide_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} +- backward_api : divide_scalar_grad + forward : divide_scalar (Tensor x, float scalar) -> Tensor(out) + args : (Tensor out_grad, float scalar) + output : Tensor(x_grad) + invoke : divide_scalar(out_grad, scalar) + +- backward_api : log1p_grad + forward : log1p(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : log1p_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + log1p_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + - backward_api : masked_matmul_grad forward : masked_matmul(Tensor x, Tensor y, Tensor mask) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) @@ -71,19 +150,43 @@ func : mv_coo_grad{sparse_coo, dense, dense -> sparse_coo, dense}, mv_csr_grad{sparse_csr, dense, dense -> sparse_csr, dense} +- backward_api : pow_grad + forward : pow(Tensor x, float factor) -> Tensor(out) + args : (Tensor x, Tensor out_grad, float factor) + output : Tensor(x_grad) + kernel : + func : pow_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + pow_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + - backward_api : relu_grad forward : relu(Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) kernel : - func : sparse_coo_relu_grad {sparse_coo, sparse_coo -> sparse_coo} + func : relu_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + relu_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : scale_grad + forward : scale(Tensor x, float scale, float bias, bool bias_after_scale) -> Tensor(out) + args : (Tensor out_grad, float scale) + output : Tensor(x_grad) + invoke : scale(out_grad, scale, 0.0, true) - backward_api : sin_grad forward : sin(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) kernel : - func : sparse_coo_sin_grad {sparse_coo, sparse_coo -> sparse_coo} + func : sin_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + sin_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : sinh_grad + forward : sinh(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : sinh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + sinh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} - backward_api : softmax_grad forward : softmax(Tensor x, int axis=-1) -> Tensor(out) @@ -104,7 +207,16 @@ args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) kernel : - func : sparse_coo_sqrt_grad {sparse_coo, sparse_coo -> sparse_coo} + func : sqrt_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + sqrt_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_api : square_grad + forward : square(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : square_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + square_csr_grad {sparse_csr, sparse_csr -> sparse_csr} - backward_api : subtract_grad forward : subtract(Tensor x, Tensor y) -> Tensor(out) @@ -114,12 +226,21 @@ func : subtract_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, 
subtract_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} +- backward_api : tan_grad + forward : tan(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + kernel : + func : tan_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + tan_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + - backward_api : tanh_grad forward : tanh(Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) kernel : - func : sparse_coo_tanh_grad {sparse_coo, sparse_coo -> sparse_coo} + func : tanh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + tanh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} - backward_api : values_grad forward : coo_values(Tensor x) -> Tensor(out) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index 8e63a0fd22ade..4daa231437116 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -212,12 +212,17 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acosh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Square); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log10); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); @@ -233,9 +238,12 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, beta); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps); +DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, alpha); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(BRelu, t_min, t_max); +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b); +DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, beta, threshold); DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, slope, offset); diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h index 5cc4357c937db..8e5913e10fdb7 100644 --- a/paddle/phi/kernels/activation_kernel.h +++ b/paddle/phi/kernels/activation_kernel.h @@ -40,12 +40,12 @@ namespace phi { float attr2, \ DenseTensor* out); +DECLARE_ACTIVATION_KERNEL(Sin) DECLARE_ACTIVATION_KERNEL(Cos) DECLARE_ACTIVATION_KERNEL(Tan) -DECLARE_ACTIVATION_KERNEL(Acos) -DECLARE_ACTIVATION_KERNEL(Sin) DECLARE_ACTIVATION_KERNEL(Asin) DECLARE_ACTIVATION_KERNEL(Atan) +DECLARE_ACTIVATION_KERNEL(Acos) DECLARE_ACTIVATION_KERNEL(Sinh) DECLARE_ACTIVATION_KERNEL(Cosh) DECLARE_ACTIVATION_KERNEL(Asinh) @@ -53,15 +53,14 @@ DECLARE_ACTIVATION_KERNEL(Acosh) DECLARE_ACTIVATION_KERNEL(Atanh) DECLARE_ACTIVATION_KERNEL(Relu) DECLARE_ACTIVATION_KERNEL(Tanh) +DECLARE_ACTIVATION_KERNEL(TanhShrink) +DECLARE_ACTIVATION_KERNEL(Silu) DECLARE_ACTIVATION_KERNEL(Exp) DECLARE_ACTIVATION_KERNEL(Expm1) DECLARE_ACTIVATION_KERNEL(Reciprocal) DECLARE_ACTIVATION_KERNEL(Square) DECLARE_ACTIVATION_KERNEL(Sqrt) DECLARE_ACTIVATION_KERNEL(Rsqrt) - 
-DECLARE_ACTIVATION_KERNEL(TanhShrink) -DECLARE_ACTIVATION_KERNEL(Silu) DECLARE_ACTIVATION_KERNEL(Sigmoid) DECLARE_ACTIVATION_KERNEL(LogSigmoid) DECLARE_ACTIVATION_KERNEL(Log) @@ -77,28 +76,18 @@ DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Relu6, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Mish, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(HardShrink, threshold) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(SoftShrink, lambda) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Elu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Swish, beta) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Celu, alpha) +DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(Logit, eps) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(BRelu, t_min, t_max) DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(STanh, scale_a, scale_b) -DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) - DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(Softplus, beta, threshold) - -template -void LogitKernel(const Context& dev_ctx, - const DenseTensor& x, - float eps, - DenseTensor* out); - -template -void MishKernel(const Context& dev_ctx, - const DenseTensor& x, - float threshold, - DenseTensor* out); +DECLARE_ACTIVATION_KERNEL_WITH_TWO_ATTRS(HardSigmoid, slope, offset) template void HardSwishKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/funcs/eigen/eigen_function.h b/paddle/phi/kernels/funcs/eigen/eigen_function.h index b971b4f95ef57..1e81256e79e14 100644 --- a/paddle/phi/kernels/funcs/eigen/eigen_function.h +++ b/paddle/phi/kernels/funcs/eigen/eigen_function.h @@ -118,6 +118,18 @@ struct EigenSub { const InType& right); }; +template +struct EigenDiv { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, + OutType out, + const InType& in, + const T value); +}; + template struct EigenSlice { using Array = Eigen::DSizes; diff --git a/paddle/phi/kernels/funcs/eigen/elementwise.cc b/paddle/phi/kernels/funcs/eigen/elementwise.cc index 507a0116c3c20..713513757ad8c 100644 --- a/paddle/phi/kernels/funcs/eigen/elementwise.cc +++ b/paddle/phi/kernels/funcs/eigen/elementwise.cc @@ -55,5 +55,22 @@ struct EigenSub { template struct EigenSub; +template +struct EigenDiv { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, + OutType out, + const InType& in, + const T value) { + out.device(dev) = in / value; + } +}; + +template struct EigenDiv; +template struct EigenDiv; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/eigen/elementwise.cu b/paddle/phi/kernels/funcs/eigen/elementwise.cu index 3855ba8ccf945..1fb3b8a376efa 100644 --- a/paddle/phi/kernels/funcs/eigen/elementwise.cu +++ b/paddle/phi/kernels/funcs/eigen/elementwise.cu @@ -55,5 +55,22 @@ struct EigenSub { template struct EigenSub; +template +struct EigenDiv { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, + OutType out, + const InType& in, + const T value) { + out.device(dev) = in / value; + } +}; + +template struct EigenDiv; +template struct EigenDiv; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/unary_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/unary_grad_kernel.cc new file 
mode 100644 index 0000000000000..f8520db2cad6f --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/unary_grad_kernel.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" + +#define PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(name, prefix) \ + PD_REGISTER_KERNEL(name##_coo_grad, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CooGradKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(name##_csr_grad, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CsrGradKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(sin, Sin) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(tan, Tan) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(asin, Asin) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(atan, Atan) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(sinh, Sinh) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(tanh, Tanh) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(asinh, Asinh) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(atanh, Atanh) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(sqrt, Sqrt) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(square, Square) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(log1p, Log1p) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(relu, Relu) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(abs, Abs) +PD_REGISTER_SPARSE_UNARY_CPU_GRAD_KERNEL(pow, Pow) + +PD_REGISTER_KERNEL(cast_coo_grad, + CPU, + ALL_LAYOUT, + phi::sparse::CastCooGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(cast_csr_grad, + CPU, + ALL_LAYOUT, + phi::sparse::CastCsrGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/cpu/unary_kernel.cc b/paddle/phi/kernels/sparse/cpu/unary_kernel.cc new file mode 100644 index 0000000000000..1c1ece27d97d0 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/unary_kernel.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" +#include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h" + +namespace phi { +namespace sparse { + +template +void DivCooScalarKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scalar, + SparseCooTensor* out) { + EmptyLikeCooKernel(dev_ctx, x, out); + + auto eigen_out = + phi::EigenVector::Flatten(*(out->mutable_non_zero_elements())); + auto eigen_x = phi::EigenVector::Flatten(x.non_zero_elements()); + auto& dev = *dev_ctx.eigen_device(); + + phi::funcs::EigenDiv, T>::Eval( + dev, eigen_out, eigen_x, static_cast(scalar)); +} + +template +void DivCsrScalarKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scalar, + SparseCsrTensor* out) { + EmptyLikeCsrKernel(dev_ctx, x, out); + + auto eigen_out = + phi::EigenVector::Flatten(*(out->mutable_non_zero_elements())); + auto eigen_x = phi::EigenVector::Flatten(x.non_zero_elements()); + auto& dev = *dev_ctx.eigen_device(); + + phi::funcs::EigenDiv, T>::Eval( + dev, eigen_out, eigen_x, static_cast(scalar)); +} + +} // namespace sparse +} // namespace phi + +#define PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(name, prefix) \ + PD_REGISTER_KERNEL(name##_coo, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CooKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(name##_csr, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CsrKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(sin, Sin) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(tan, Tan) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(asin, Asin) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(atan, Atan) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(sinh, Sinh) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(tanh, Tanh) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(asinh, Asinh) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(atanh, Atanh) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(sqrt, Sqrt) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(square, Square) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(log1p, Log1p) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(relu, Relu) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(abs, Abs) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(pow, Pow) +PD_REGISTER_SPARSE_UNARY_CPU_KERNEL(scale, Scale) + +PD_REGISTER_KERNEL(divide_coo_scalar, + CPU, + ALL_LAYOUT, + phi::sparse::DivCooScalarKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(divide_csr_scalar, + CPU, + ALL_LAYOUT, + phi::sparse::DivCsrScalarKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +PD_REGISTER_KERNEL(cast_coo, + CPU, + ALL_LAYOUT, + phi::sparse::CastCooKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(cast_csr, + CPU, + ALL_LAYOUT, + phi::sparse::CastCsrKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu new file mode 100644 index 0000000000000..c1f2b2a1f0d1d --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 
2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" + +#define PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(name, prefix) \ + PD_REGISTER_KERNEL(name##_coo_grad, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CooGradKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(name##_csr_grad, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CsrGradKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(sin, Sin) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(tan, Tan) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(asin, Asin) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(atan, Atan) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(sinh, Sinh) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(tanh, Tanh) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(asinh, Asinh) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(atanh, Atanh) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(sqrt, Sqrt) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(square, Square) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(log1p, Log1p) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(relu, Relu) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(abs, Abs) +PD_REGISTER_SPARSE_UNARY_GPU_GRAD_KERNEL(pow, Pow) + +PD_REGISTER_KERNEL(cast_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::CastCooGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(cast_csr_grad, + GPU, + ALL_LAYOUT, + phi::sparse::CastCsrGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu new file mode 100644 index 0000000000000..fdf0b5106d3cf --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu @@ -0,0 +1,142 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h" + +namespace phi { +namespace sparse { + +template +struct DivScalarFunctor { + T value_; + + explicit DivScalarFunctor(T value) : value_(value) {} + + __device__ __forceinline__ T operator()(const T x) const { + return x / value_; + } +}; + +template +void DivCooScalarKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scalar, + SparseCooTensor* out) { + EmptyLikeCooKernel(dev_ctx, x, out); + + std::vector ins = {&(x.non_zero_elements())}; + std::vector outs = {out->mutable_non_zero_elements()}; + DivScalarFunctor func(static_cast(scalar)); + funcs::ElementwiseKernel>(dev_ctx, ins, &outs, func); +} + +template +void DivCsrScalarKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scalar, + SparseCsrTensor* out) { + EmptyLikeCsrKernel(dev_ctx, x, out); + + std::vector ins = {&(x.non_zero_elements())}; + std::vector outs = {out->mutable_non_zero_elements()}; + DivScalarFunctor func(static_cast(scalar)); + funcs::ElementwiseKernel>(dev_ctx, ins, &outs, func); +} + +} // namespace sparse +} // namespace phi + +#define PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(name, prefix) \ + PD_REGISTER_KERNEL(name##_coo, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CooKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(name##_csr, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::prefix##CsrKernel, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(sin, Sin) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(tan, Tan) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(asin, Asin) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(atan, Atan) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(sinh, Sinh) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(tanh, Tanh) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(asinh, Asinh) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(atanh, Atanh) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(sqrt, Sqrt) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(square, Square) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(log1p, Log1p) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(relu, Relu) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(abs, Abs) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(pow, Pow) +PD_REGISTER_SPARSE_UNARY_GPU_KERNEL(scale, Scale) + +PD_REGISTER_KERNEL(divide_coo_scalar, + GPU, + ALL_LAYOUT, + phi::sparse::DivCooScalarKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(divide_csr_scalar, + GPU, + ALL_LAYOUT, + phi::sparse::DivCsrScalarKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +PD_REGISTER_KERNEL(cast_coo, + GPU, + ALL_LAYOUT, + phi::sparse::CastCooKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(cast_csr, + GPU, + ALL_LAYOUT, + phi::sparse::CastCsrKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h new file mode 100644 index 0000000000000..ffc5f6bbacae3 --- /dev/null +++ b/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/abs_grad_kernel.h" +#include "paddle/phi/kernels/activation_grad_kernel.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h" + +namespace phi { +namespace sparse { + +#define DEFINE_SPARSE_UNARY_GRAD_KERNEL(prefix) \ + template \ + void prefix##CooGradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& dout, \ + SparseCooTensor* dx) { \ + EmptyLikeCooKernel(dev_ctx, x_or_out, dx); \ + phi::prefix##GradKernel(dev_ctx, \ + x_or_out.non_zero_elements(), \ + dout.non_zero_elements(), \ + dx->mutable_non_zero_elements()); \ + } \ + \ + template \ + void prefix##CsrGradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& dout, \ + SparseCsrTensor* dx) { \ + EmptyLikeCsrKernel(dev_ctx, x_or_out, dx); \ + phi::prefix##GradKernel(dev_ctx, \ + x_or_out.non_zero_elements(), \ + dout.non_zero_elements(), \ + dx->mutable_non_zero_elements()); \ + } + +#define DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(prefix, attr) \ + template \ + void prefix##CooGradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& dout, \ + float attr, \ + SparseCooTensor* dx) { \ + EmptyLikeCooKernel(dev_ctx, x_or_out, dx); \ + phi::prefix##GradKernel(dev_ctx, \ + x_or_out.non_zero_elements(), \ + dout.non_zero_elements(), \ + attr, \ + dx->mutable_non_zero_elements()); \ + } \ + \ + template \ + void prefix##CsrGradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& dout, \ + float attr, \ + SparseCsrTensor* dx) { \ + EmptyLikeCsrKernel(dev_ctx, x_or_out, dx); \ + phi::prefix##GradKernel(dev_ctx, \ + x_or_out.non_zero_elements(), \ + dout.non_zero_elements(), \ + attr, \ + dx->mutable_non_zero_elements()); \ + } + +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Sin) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Tan) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Asin) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Atan) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Sinh) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Tanh) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Asinh) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Atanh) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Sqrt) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Square) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Log1p) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Relu) +DEFINE_SPARSE_UNARY_GRAD_KERNEL(Abs) +DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor) + +template +void CastCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& dout, + DataType value_dtype, + SparseCooTensor* dx) { + EmptyLikeCooKernel(dev_ctx, x, dx); + if (value_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, + dout.non_zero_elements(), + dev_ctx.GetPlace(), + 
false, + dx->mutable_non_zero_elements()); + } else { + phi::CastKernel(dev_ctx, + dout.non_zero_elements(), + x.non_zero_elements().dtype(), + dx->mutable_non_zero_elements()); + } +} + +template +void CastCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& dout, + DataType value_dtype, + SparseCsrTensor* dx) { + EmptyLikeCsrKernel(dev_ctx, x, dx); + if (value_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, + dout.non_zero_elements(), + dev_ctx.GetPlace(), + false, + dx->mutable_non_zero_elements()); + } else { + phi::CastKernel(dev_ctx, + dout.non_zero_elements(), + x.non_zero_elements().dtype(), + dx->mutable_non_zero_elements()); + } +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h new file mode 100644 index 0000000000000..231fc551f4788 --- /dev/null +++ b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h @@ -0,0 +1,207 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/abs_kernel.h" +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/scale_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/trunc_kernel.h" + +namespace phi { +namespace sparse { + +#define DEFINE_SPARSE_UNARY_KERNEL(prefix) \ + template \ + void prefix##CooKernel(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + SparseCooTensor* out) { \ + EmptyLikeCooKernel(dev_ctx, x, out); \ + phi::prefix##Kernel( \ + dev_ctx, x.non_zero_elements(), out->mutable_non_zero_elements()); \ + } \ + \ + template \ + void prefix##CsrKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + SparseCsrTensor* out) { \ + EmptyLikeCsrKernel(dev_ctx, x, out); \ + phi::prefix##Kernel( \ + dev_ctx, x.non_zero_elements(), out->mutable_non_zero_elements()); \ + } + +#define DEFINE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(prefix, attr) \ + template \ + void prefix##CooKernel(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + float attr, \ + SparseCooTensor* out) { \ + EmptyLikeCooKernel(dev_ctx, x, out); \ + phi::prefix##Kernel(dev_ctx, \ + x.non_zero_elements(), \ + attr, \ + out->mutable_non_zero_elements()); \ + } \ + \ + template \ + void prefix##CsrKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + float attr, \ + SparseCsrTensor* out) { \ + EmptyLikeCsrKernel(dev_ctx, x, out); \ + phi::prefix##Kernel(dev_ctx, \ + x.non_zero_elements(), \ + attr, \ + out->mutable_non_zero_elements()); \ + } + +DEFINE_SPARSE_UNARY_KERNEL(Sin) +DEFINE_SPARSE_UNARY_KERNEL(Tan) +DEFINE_SPARSE_UNARY_KERNEL(Asin) 
+DEFINE_SPARSE_UNARY_KERNEL(Atan) +DEFINE_SPARSE_UNARY_KERNEL(Sinh) +DEFINE_SPARSE_UNARY_KERNEL(Tanh) +DEFINE_SPARSE_UNARY_KERNEL(Asinh) +DEFINE_SPARSE_UNARY_KERNEL(Atanh) +DEFINE_SPARSE_UNARY_KERNEL(Sqrt) +DEFINE_SPARSE_UNARY_KERNEL(Square) +DEFINE_SPARSE_UNARY_KERNEL(Log1p) +DEFINE_SPARSE_UNARY_KERNEL(Relu) +DEFINE_SPARSE_UNARY_KERNEL(Abs) +DEFINE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(Pow, factor) + +template +void ScaleCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scale, + float bias, + bool bias_after_scale, + SparseCooTensor* out) { + EmptyLikeCooKernel(dev_ctx, x, out); + phi::ScaleKernel(dev_ctx, + x.non_zero_elements(), + scale, + bias, + bias_after_scale, + out->mutable_non_zero_elements()); +} + +template +void ScaleCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scale, + float bias, + bool bias_after_scale, + SparseCsrTensor* out) { + EmptyLikeCsrKernel(dev_ctx, x, out); + phi::ScaleKernel(dev_ctx, + x.non_zero_elements(), + scale, + bias, + bias_after_scale, + out->mutable_non_zero_elements()); +} + +template +void CastCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + DataType index_dtype, + DataType value_dtype, + SparseCooTensor* out) { + out->set_dims(x.dims()); + + const DenseTensor& x_indices = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor* out_indices = out->mutable_non_zero_indices(); + DenseTensor* out_values = out->mutable_non_zero_elements(); + + if (index_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, x_indices, dev_ctx.GetPlace(), false, out_indices); + } else { + phi::MetaTensor meta(out_indices); + meta.set_dims(x_indices.dims()); + meta.set_dtype(index_dtype); + + PD_VISIT_INTEGRAL_TYPES(x_indices.dtype(), "CastCooKernel", [&] { + phi::CastKernel( + dev_ctx, x_indices, index_dtype, out_indices); + }); + } + + if (value_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, x_values, dev_ctx.GetPlace(), false, out_values); + } else { + phi::MetaTensor meta(out_values); + meta.set_dims(x_values.dims()); + meta.set_dtype(value_dtype); + phi::CastKernel(dev_ctx, x_values, value_dtype, out_values); + } +} + +template +void CastCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + DataType index_dtype, + DataType value_dtype, + SparseCsrTensor* out) { + out->set_dims(x.dims()); + + const DenseTensor& x_crows = x.non_zero_crows(); + const DenseTensor& x_cols = x.non_zero_cols(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor* out_crows = out->mutable_non_zero_crows(); + DenseTensor* out_cols = out->mutable_non_zero_cols(); + DenseTensor* out_values = out->mutable_non_zero_elements(); + + if (index_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, x_crows, dev_ctx.GetPlace(), false, out_crows); + phi::Copy(dev_ctx, x_cols, dev_ctx.GetPlace(), false, out_cols); + } else { + phi::MetaTensor crows_meta(out_crows); + crows_meta.set_dims(x_crows.dims()); + crows_meta.set_dtype(index_dtype); + + PD_VISIT_INTEGRAL_TYPES(x_crows.dtype(), "CastCsrKernel", [&] { + phi::CastKernel( + dev_ctx, x_crows, index_dtype, out_crows); + }); + + phi::MetaTensor cols_meta(out_cols); + cols_meta.set_dims(x_cols.dims()); + cols_meta.set_dtype(index_dtype); + + PD_VISIT_INTEGRAL_TYPES(x_cols.dtype(), "CastCsrKernel", [&] { + phi::CastKernel(dev_ctx, x_cols, index_dtype, out_cols); + }); + } + + if (value_dtype == DataType::UNDEFINED) { + phi::Copy(dev_ctx, x_values, dev_ctx.GetPlace(), false, out_values); + } else { + phi::MetaTensor meta(out_values); + 
meta.set_dims(x_values.dims()); + meta.set_dtype(value_dtype); + phi::CastKernel(dev_ctx, x_values, value_dtype, out_values); + } +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.cc b/paddle/phi/kernels/sparse/unary_grad_kernel.cc deleted file mode 100644 index cd844532e938f..0000000000000 --- a/paddle/phi/kernels/sparse/unary_grad_kernel.cc +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/core/sparse_csr_tensor.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/activation_grad_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#define DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \ - namespace phi { \ - namespace sparse { \ - \ - template \ - void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ - const SparseCooTensor& x_or_out, \ - const SparseCooTensor& out_grad, \ - SparseCooTensor* x_grad) { \ - DenseTensor non_zero_indices = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_indices()); \ - DenseTensor non_zero_elements = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ - phi::Copy(dev_ctx, \ - x_or_out.non_zero_indices(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_indices); \ - phi::DenseKernelFunc(dev_ctx, \ - x_or_out.non_zero_elements(), \ - out_grad.non_zero_elements(), \ - &non_zero_elements); \ - x_grad->SetMember( \ - non_zero_indices, non_zero_elements, x_or_out.dims(), true); \ - } \ - \ - template \ - void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ - const SparseCsrTensor& x_or_out, \ - const SparseCsrTensor& out_grad, \ - SparseCsrTensor* out) { \ - DenseTensor non_zero_crows = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_crows()); \ - DenseTensor non_zero_cols = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_cols()); \ - DenseTensor non_zero_elements = \ - phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ - phi::Copy(dev_ctx, \ - x_or_out.non_zero_crows(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_crows); \ - phi::Copy(dev_ctx, \ - x_or_out.non_zero_cols(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_cols); \ - phi::DenseKernelFunc(dev_ctx, \ - x_or_out.non_zero_elements(), \ - out_grad.non_zero_elements(), \ - &non_zero_elements); \ - out->SetMember( \ - non_zero_crows, non_zero_cols, non_zero_elements, x_or_out.dims()); \ - } \ - } \ - } - -#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ - CPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCoo##DenseKernelFunc, \ - float, \ - double) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ - } \ - 
PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ - CPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCsr##DenseKernelFunc, \ - float, \ - double) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ - } - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ - GPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCoo##DenseKernelFunc, \ - float, \ - double, \ - phi::dtype::float16) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ - } \ - \ - PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ - GPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCsr##DenseKernelFunc, \ - float, \ - double, \ - phi::dtype::float16) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ - } -#else -// This macro definition is empty when GPU is disabled -#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) -#endif - -#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) - -#define DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(kernel_name, \ - DenseKernelFunc) \ - DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \ - REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) - -// NOTE: the following code is to bypass the restriction of Paddle -// kernel registration mechanism. Do NOT refactor them unless you -// know what you are doing. -// If you want to implement any new kernel, please follow `sin_grad`, -// `tanh_grad` etc, do NOT follow the following `relu_grad`. -DEFINE_SPARSE_UNARY_GRAD_KERNEL(ReluGradKernel) - -PD_REGISTER_KERNEL(sparse_coo_relu_grad, - CPU, - ALL_LAYOUT, - phi::sparse::SparseCooReluGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -PD_REGISTER_KERNEL(sparse_csr_relu_grad, - CPU, - ALL_LAYOUT, - phi::sparse::SparseCsrReluGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_coo_relu_grad, - GPU, - ALL_LAYOUT, - phi::sparse::SparseCooReluGradKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -PD_REGISTER_KERNEL(sparse_csr_relu_grad, - GPU, - ALL_LAYOUT, - phi::sparse::SparseCsrReluGradKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -} -#endif - -DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sin_grad, SinGradKernel) -DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) -DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(tanh_grad, TanhGradKernel) diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.h b/paddle/phi/kernels/sparse/unary_grad_kernel.h index 24ea4fee1a4fd..eb2cf9ed697e9 100644 --- a/paddle/phi/kernels/sparse/unary_grad_kernel.h +++ b/paddle/phi/kernels/sparse/unary_grad_kernel.h @@ -17,25 +17,65 @@ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" -#define DECLARE_SPARSE_UNARY_GRAD_KERNEL(name) \ - template \ - void SparseCoo##name##GradKernel(const Context& dev_ctx, \ - const SparseCooTensor& x, \ - const SparseCooTensor& out_grad, \ - SparseCooTensor* x_grad); \ - \ - template \ - void SparseCsr##name##GradKernel(const Context& dev_ctx, \ - const SparseCsrTensor& x, \ - const 
SparseCsrTensor& out_grad, \ - SparseCsrTensor* x_grad); - namespace phi { namespace sparse { +#define DECLARE_SPARSE_UNARY_GRAD_KERNEL(prefix) \ + template \ + void prefix##CooGradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& dout, \ + SparseCooTensor* dx); \ + \ + template \ + void prefix##CsrGradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& dout, \ + SparseCsrTensor* dx); + +#define DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(prefix, attr) \ + template \ + void prefix##CooGradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& dout, \ + float attr, \ + SparseCooTensor* dx); \ + \ + template \ + void prefix##CsrGradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& dout, \ + float attr, \ + SparseCsrTensor* dx); + +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sin) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Tan) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Asin) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Atan) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sinh) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Asinh) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Atanh) DECLARE_SPARSE_UNARY_GRAD_KERNEL(Relu) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Tanh) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Square) DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sqrt) -DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sin) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Log1p) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Abs) +DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor) + +template +void CastCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& dout, + DataType value_dtype, + SparseCooTensor* dx); + +template +void CastCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& dout, + DataType value_dtype, + SparseCsrTensor* dx); } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_kernel.cc b/paddle/phi/kernels/sparse/unary_kernel.cc deleted file mode 100644 index 2999536b34ee9..0000000000000 --- a/paddle/phi/kernels/sparse/unary_kernel.cc +++ /dev/null @@ -1,177 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/kernels/sparse/unary_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/core/sparse_csr_tensor.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/activation_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#define DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ - namespace phi { \ - namespace sparse { \ - \ - template \ - void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ - const SparseCooTensor& x, \ - SparseCooTensor* out) { \ - DenseTensor non_zero_indices = \ - phi::EmptyLike(dev_ctx, x.non_zero_indices()); \ - DenseTensor non_zero_elements = \ - phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ - phi::Copy(dev_ctx, \ - x.non_zero_indices(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_indices); \ - phi::DenseKernelFunc( \ - dev_ctx, x.non_zero_elements(), &non_zero_elements); \ - out->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); \ - } \ - \ - template \ - void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ - const SparseCsrTensor& x, \ - SparseCsrTensor* out) { \ - DenseTensor non_zero_crows = \ - phi::EmptyLike(dev_ctx, x.non_zero_crows()); \ - DenseTensor non_zero_cols = \ - phi::EmptyLike(dev_ctx, x.non_zero_cols()); \ - DenseTensor non_zero_elements = \ - phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ - phi::Copy(dev_ctx, \ - x.non_zero_crows(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_crows); \ - phi::Copy(dev_ctx, \ - x.non_zero_cols(), \ - dev_ctx.GetPlace(), \ - false, \ - &non_zero_cols); \ - phi::DenseKernelFunc( \ - dev_ctx, x.non_zero_elements(), &non_zero_elements); \ - out->SetMember( \ - non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); \ - } \ - } \ - } - -#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ - CPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCoo##DenseKernelFunc, \ - float, \ - double) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ - } \ - PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ - CPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCsr##DenseKernelFunc, \ - float, \ - double) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ - } - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ - GPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCoo##DenseKernelFunc, \ - float, \ - double, \ - phi::dtype::float16) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ - } \ - \ - PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ - GPU, \ - ALL_LAYOUT, \ - phi::sparse::SparseCsr##DenseKernelFunc, \ - float, \ - double, \ - phi::dtype::float16) { \ - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ - } -#else -// This macro definition is empty when GPU is disabled -#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) -#endif - -#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) - -#define DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ - DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ - REGISTER_SPARSE_UNARY_KERNEL(kernel_name, 
DenseKernelFunc) - -// NOTE: the following code is to bypass the restriction of Paddle -// kernel registration mechanism. Do NOT refactor them unless you -// know what you are doing. -// If you want to implement any new kernel, please follow `sin`, -// `tanh` etc, do NOT follow `sqrt`. -DEFINE_SPARSE_UNARY_KERNEL(SqrtKernel) - -PD_REGISTER_KERNEL(sparse_coo_sqrt, - CPU, - ALL_LAYOUT, - phi::sparse::SparseCooSqrtKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -PD_REGISTER_KERNEL(sparse_csr_sqrt, - CPU, - ALL_LAYOUT, - phi::sparse::SparseCsrSqrtKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_coo_sqrt, - GPU, - ALL_LAYOUT, - phi::sparse::SparseCooSqrtKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -PD_REGISTER_KERNEL(sparse_csr_sqrt, - GPU, - ALL_LAYOUT, - phi::sparse::SparseCsrSqrtKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); -} - -#endif - -DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(sin, SinKernel) -DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(tanh, TanhKernel) -DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(relu, ReluKernel) diff --git a/paddle/phi/kernels/sparse/unary_kernel.h b/paddle/phi/kernels/sparse/unary_kernel.h index 4470173c143db..fdb6b21a44427 100644 --- a/paddle/phi/kernels/sparse/unary_kernel.h +++ b/paddle/phi/kernels/sparse/unary_kernel.h @@ -14,35 +14,104 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" -#include "paddle/phi/kernels/activation_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" -#define DECLARE_SPARSE_UNARY_KERNEL(name) \ +namespace phi { +namespace sparse { + +#define DECLARE_SPARSE_UNARY_KERNEL(prefix) \ template \ - void SparseCoo##name##Kernel( \ + void prefix##CooKernel( \ const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out); \ \ template \ - void SparseCsr##name##Kernel( \ + void prefix##CsrKernel( \ const Context& dev_ctx, const SparseCsrTensor& x, SparseCsrTensor* out); -namespace phi { -namespace sparse { +#define DECLARE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(prefix, attr) \ + template \ + void prefix##CooKernel(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + float attr, \ + SparseCooTensor* out); \ + \ + template \ + void prefix##CsrKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + float attr, \ + SparseCsrTensor* out); +DECLARE_SPARSE_UNARY_KERNEL(Sin) +DECLARE_SPARSE_UNARY_KERNEL(Tan) +DECLARE_SPARSE_UNARY_KERNEL(Asin) +DECLARE_SPARSE_UNARY_KERNEL(Atan) +DECLARE_SPARSE_UNARY_KERNEL(Sinh) +DECLARE_SPARSE_UNARY_KERNEL(Asinh) +DECLARE_SPARSE_UNARY_KERNEL(Atanh) DECLARE_SPARSE_UNARY_KERNEL(Relu) +DECLARE_SPARSE_UNARY_KERNEL(Tanh) +DECLARE_SPARSE_UNARY_KERNEL(Square) DECLARE_SPARSE_UNARY_KERNEL(Sqrt) -DECLARE_SPARSE_UNARY_KERNEL(Sin) +DECLARE_SPARSE_UNARY_KERNEL(Log1p) +DECLARE_SPARSE_UNARY_KERNEL(Abs) +DECLARE_SPARSE_UNARY_KERNEL_WITH_ONE_ATTR(Pow, factor) + +template +void ScaleCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scale, + float bias, + bool bias_after_scale, + SparseCooTensor* out); + +template +void ScaleCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scale, + float bias, + bool bias_after_scale, + SparseCsrTensor* out); template 
-SparseCooTensor SparseRelu(const Context& dev_ctx, const SparseCooTensor& x) { - DenseTensor indices, values; - SparseCooTensor coo(indices, values, x.dims()); - SparseCooReluKernel(dev_ctx, x, &coo); +void DivCooScalarKernel(const Context& dev_ctx, + const SparseCooTensor& x, + float scalar, + SparseCooTensor* out); + +template +void DivCsrScalarKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + float scalar, + SparseCsrTensor* out); + +template +void CastCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + DataType index_dtype, + DataType value_dtype, + SparseCooTensor* out); + +template +void CastCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + DataType index_dtype, + DataType value_dtype, + SparseCsrTensor* out); + +template +SparseCooTensor ReluCoo(const Context& dev_ctx, const SparseCooTensor& x) { + SparseCooTensor coo; + ReluCooKernel(dev_ctx, x, &coo); return coo; } +template +SparseCooTensor ReluCsr(const Context& dev_ctx, const SparseCooTensor& x) { + SparseCooTensor csr; + ReluCsrKernel(dev_ctx, x, &csr); + return csr; +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc index 51d1e67f5af2a..9c6776fb2ac35 100644 --- a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc @@ -49,7 +49,7 @@ TEST(DEV_API, sparse_relu) { memcpy(dense_x.data(), data.data(), data.size() * sizeof(float)); auto sparse_coo = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, 2); - auto sparse_out = sparse::SparseRelu(dev_ctx_cpu, sparse_coo); + auto sparse_out = sparse::ReluCoo(dev_ctx_cpu, sparse_coo); DenseTensor dense_out = phi::EmptyLike(dev_ctx_cpu, sparse_out.non_zero_elements()); ReluKernel(dev_ctx_cpu, sparse_coo.non_zero_elements(), &dense_out); @@ -69,7 +69,7 @@ TEST(DEV_API, sparse_relu) { SparseCooTensor sparse_out_grad( sparse_coo.non_zero_indices(), dense_out, {3, 4}); - sparse::SparseCooReluGradKernel( + sparse::ReluCooGradKernel( dev_ctx_cpu, sparse_coo, sparse_out_grad, &sparse_grad_x); cmp = memcmp(dense_grad_x.data(), diff --git a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py index 61932cf4a7b0a..12546ea463a84 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py @@ -125,16 +125,14 @@ def func_test_coo(self, op): def test_support_dtypes_csr(self): paddle.device.set_device('cpu') if paddle.device.get_device() == "cpu": - with _test_eager_guard(): - for op in op_list: - self.func_test_csr(op) + for op in op_list: + self.func_test_csr(op) def test_support_dtypes_coo(self): paddle.device.set_device('cpu') if paddle.device.get_device() == "cpu": - with _test_eager_guard(): - for op in op_list: - self.func_test_coo(op) + for op in op_list: + self.func_test_coo(op) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sparse_model.py b/python/paddle/fluid/tests/unittests/test_sparse_model.py index 90f30e383174c..c070614fc708b 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_model.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_model.py @@ -62,3 +62,7 @@ def test(self): sparse_loss.backward() assert np.allclose(x.grad.numpy(), sparse_x.grad.to_dense().numpy()) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py index 2272022e8d6dc..36d64f5067263 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py @@ -12,137 +12,142 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function import unittest -from typing import Union, Callable import numpy as np import paddle -import paddle.fluid as fluid -from paddle.fluid.framework import _test_eager_guard -from paddle import _C_ops +from paddle.fluid.framework import convert_np_dtype_to_dtype_ class TestSparseUnary(unittest.TestCase): - def assert_raises_on_dense_tensor(self, sparse_func): - with _test_eager_guard(): - dense_x = paddle.ones((2, 3)) - with self.assertRaises(NotImplementedError): - sparse_func(dense_x) - - def compare_with_dense( - self, - x, - to_sparse: Callable[[paddle.Tensor], paddle.Tensor], - dense_func: Callable[[paddle.Tensor], paddle.Tensor], - sparse_func: Callable[[paddle.Tensor], paddle.Tensor], - test_gradient: bool, - ): - - def tensor_allclose(dense_tensor: paddle.Tensor, - sparse_tensor: paddle.Tensor): - dense_numpy = dense_tensor.numpy() - mask = ~np.isnan(dense_numpy) - return np.allclose(dense_numpy[mask], - sparse_tensor.to_dense().numpy()[mask]) - - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - with _test_eager_guard(): - dense_x = paddle.to_tensor(x, - dtype="float32", - stop_gradient=not test_gradient) - - sparse_x = to_sparse(dense_x) - sparse_out = sparse_func(sparse_x) - - dense_x = paddle.to_tensor(x, - dtype="float32", - stop_gradient=not test_gradient) + def to_sparse(self, x, format): + if format == 'coo': + return x.detach().to_sparse_coo(sparse_dim=x.ndim) + elif format == 'csr': + return x.detach().to_sparse_csr() + + def check_result(self, dense_func, sparse_func, format, *args): + origin_x = paddle.rand([8, 16, 32], dtype='float32') + mask = paddle.randint(0, 2, [8, 16, 32]).astype('float32') + + ### check sparse coo with dense ### + dense_x = origin_x * mask + sp_x = self.to_sparse(dense_x, format) + + sp_x.stop_gradient = False + if len(args) == 0: + sp_out = sparse_func(sp_x) + elif len(args) == 1: + sp_out = sparse_func(sp_x, args[0]) + elif len(args) == 2: + sp_out = sparse_func(sp_x, args[0], args[1]) + sp_out.backward() + + dense_x.stop_gradient = False + if len(args) == 0: dense_out = dense_func(dense_x) + elif len(args) == 1: + dense_out = dense_func(dense_x, args[0]) + elif len(args) == 2: + if dense_func == paddle.cast: + dense_out = dense_func(dense_x, args[1]) + + int_dtype = convert_np_dtype_to_dtype_(args[0]) + if sp_out.is_sparse_csr(): + self.assertEqual(sp_out.crows().dtype, int_dtype) + self.assertEqual(sp_out.cols().dtype, int_dtype) + elif sp_out.is_sparse_coo(): + self.assertEqual(sp_out.indices().dtype, int_dtype) + else: + dense_out = dense_func(dense_x, args[0], args[1]) + dense_out.backward() + + # compare forward + self.assertTrue( + np.allclose(sp_out.to_dense().numpy(), dense_out.numpy())) + + # compare backward + if dense_func == paddle.sqrt: + expect_grad = np.nan_to_num(dense_x.grad.numpy(), 0., 0., 0.) 
+ else: + expect_grad = (dense_x.grad * mask).numpy() + self.assertTrue(np.allclose(sp_x.grad.to_dense().numpy(), expect_grad)) + + def compare_with_dense(self, dense_func, sparse_func): + self.check_result(dense_func, sparse_func, 'coo') + self.check_result(dense_func, sparse_func, 'csr') + + def compare_with_dense_one_attr(self, dense_func, sparse_func, attr1): + self.check_result(dense_func, sparse_func, 'coo', attr1) + self.check_result(dense_func, sparse_func, 'csr', attr1) + + def compare_with_dense_two_attr(self, dense_func, sparse_func, attr1, + attr2): + self.check_result(dense_func, sparse_func, 'coo', attr1, attr2) + self.check_result(dense_func, sparse_func, 'csr', attr1, attr2) - assert tensor_allclose(dense_out, sparse_out) + def test_sparse_sin(self): + self.compare_with_dense(paddle.sin, paddle.incubate.sparse.sin) - if test_gradient: - dense_out.backward(dense_out) - sparse_out.backward(sparse_out) - assert tensor_allclose(dense_x.grad, sparse_x.grad) - fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) + def test_sparse_tan(self): + self.compare_with_dense(paddle.tan, paddle.incubate.sparse.tan) - def test_sparse_relu(self): - x = [[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]] - sparse_dim = 2 - self.compare_with_dense( - x, - lambda x: x.to_sparse_coo(sparse_dim), - paddle.nn.ReLU(), - paddle.incubate.sparse.nn.ReLU(), - True, - ) - self.compare_with_dense( - x, - lambda x: x.to_sparse_csr(), - paddle.nn.ReLU(), - paddle.incubate.sparse.nn.ReLU(), - False, - ) - self.assert_raises_on_dense_tensor(paddle.incubate.sparse.nn.ReLU()) + def test_sparse_asin(self): + self.compare_with_dense(paddle.asin, paddle.incubate.sparse.asin) - def test_sparse_sqrt(self): - x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] - sparse_dim = 2 - self.compare_with_dense( - x, - lambda x: x.to_sparse_coo(sparse_dim), - paddle.sqrt, - paddle.incubate.sparse.sqrt, - True, - ) - self.compare_with_dense( - x, - lambda x: x.to_sparse_csr(), - paddle.sqrt, - paddle.incubate.sparse.sqrt, - False, - ) - self.assert_raises_on_dense_tensor(paddle.incubate.sparse.sqrt) + def test_sparse_atan(self): + self.compare_with_dense(paddle.atan, paddle.incubate.sparse.atan) - def test_sparse_sin(self): - x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] - sparse_dim = 2 - self.compare_with_dense( - x, - lambda x: x.to_sparse_coo(sparse_dim), - paddle.sin, - paddle.incubate.sparse.sin, - True, - ) - self.compare_with_dense( - x, - lambda x: x.to_sparse_csr(), - paddle.sin, - paddle.incubate.sparse.sin, - False, - ) - self.assert_raises_on_dense_tensor(paddle.incubate.sparse.sin) + def test_sparse_sinh(self): + self.compare_with_dense(paddle.sinh, paddle.incubate.sparse.sinh) def test_sparse_tanh(self): - x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, -4, 2, 0]] - sparse_dim = 2 - self.compare_with_dense( - x, - lambda x: x.to_sparse_coo(sparse_dim), - paddle.tanh, - paddle.incubate.sparse.tanh, - True, - ) - self.compare_with_dense( - x, - lambda x: x.to_sparse_csr(), - paddle.tanh, - paddle.incubate.sparse.tanh, - False, - ) - self.assert_raises_on_dense_tensor(paddle.incubate.sparse.tanh) + self.compare_with_dense(paddle.tanh, paddle.incubate.sparse.tanh) + + def test_sparse_asinh(self): + self.compare_with_dense(paddle.asinh, paddle.incubate.sparse.asinh) + + def test_sparse_atanh(self): + self.compare_with_dense(paddle.atanh, paddle.incubate.sparse.atanh) + + def test_sparse_sqrt(self): + self.compare_with_dense(paddle.sqrt, paddle.incubate.sparse.sqrt) + + def test_sparse_square(self): + 
self.compare_with_dense(paddle.square, paddle.incubate.sparse.square) + + def test_sparse_log1p(self): + self.compare_with_dense(paddle.log1p, paddle.incubate.sparse.log1p) + + def test_sparse_relu(self): + self.compare_with_dense(paddle.nn.ReLU(), + paddle.incubate.sparse.nn.ReLU()) + + def test_sparse_abs(self): + self.compare_with_dense(paddle.abs, paddle.incubate.sparse.abs) + + def test_sparse_neg(self): + self.compare_with_dense(paddle.neg, paddle.incubate.sparse.neg) + + def test_sparse_pow(self): + self.compare_with_dense_one_attr(paddle.pow, paddle.incubate.sparse.pow, + 3) + + def test_sparse_mul_scalar(self): + self.compare_with_dense_one_attr(paddle.Tensor.__mul__, + paddle.incubate.sparse.multiply, 3) + + def test_sparse_div_scalar(self): + self.compare_with_dense_one_attr(paddle.Tensor.__div__, + paddle.incubate.sparse.divide, 2) + + def test_sparse_cast(self): + self.compare_with_dense_two_attr(paddle.cast, + paddle.incubate.sparse.cast, 'int16', + 'float32') + self.compare_with_dense_two_attr(paddle.cast, + paddle.incubate.sparse.cast, 'int32', + 'float64') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index a12425b69299e..ac69469cbbd69 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -38,7 +38,6 @@ def test_create_coo_by_tensor(self): dense_shape, stop_gradient=False) # test the to_string.py - print(coo) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -49,6 +48,7 @@ def test_create_coo_by_np(self): dense_shape = [3, 3] coo = paddle.incubate.sparse.sparse_coo_tensor( indices, values, dense_shape) + assert np.array_equal(3, coo.nnz()) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -78,7 +78,7 @@ def test_create_csr_by_np(self): csr = paddle.incubate.sparse.sparse_csr_tensor( crows, cols, values, dense_shape) # test the to_string.py - print(csr) + assert np.array_equal(5, csr.nnz()) assert np.array_equal(crows, csr.crows().numpy()) assert np.array_equal(cols, csr.cols().numpy()) assert np.array_equal(values, csr.values().numpy()) diff --git a/python/paddle/incubate/sparse/__init__.py b/python/paddle/incubate/sparse/__init__.py index f696434118745..c56ada3468acc 100644 --- a/python/paddle/incubate/sparse/__init__.py +++ b/python/paddle/incubate/sparse/__init__.py @@ -15,27 +15,50 @@ from .creation import sparse_coo_tensor from .creation import sparse_csr_tensor -from .unary import sqrt from .unary import sin +from .unary import tan +from .unary import asin +from .unary import atan +from .unary import sinh from .unary import tanh +from .unary import asinh +from .unary import atanh +from .unary import sqrt +from .unary import square +from .unary import log1p +from .unary import abs +from .unary import pow +from .unary import cast +from .unary import neg from .binary import mv from .binary import matmul from .binary import masked_matmul - -from .math import add -from .math import divide -from .math import multiply -from .math import subtract +from .binary import add +from .binary import divide +from .binary import multiply +from .binary import subtract from . 
import nn __all__ = [ 'sparse_coo_tensor', 'sparse_csr_tensor', - 'sqrt', 'sin', + 'tan', + 'asin', + 'atan', + 'sinh', 'tanh', + 'asinh', + 'atanh', + 'sqrt', + 'square', + 'log1p', + 'abs', + 'pow', + 'cast', + 'neg', 'mv', 'matmul', 'masked_matmul', diff --git a/python/paddle/incubate/sparse/binary.py b/python/paddle/incubate/sparse/binary.py index f34378924e1f4..0c90cd92a7537 100644 --- a/python/paddle/incubate/sparse/binary.py +++ b/python/paddle/incubate/sparse/binary.py @@ -13,10 +13,19 @@ # limitations under the License. from paddle import _C_ops -from paddle.fluid.framework import dygraph_only +from paddle.fluid.framework import dygraph_only, core __all__ = [] +_int_dtype_ = [ + core.VarDesc.VarType.UINT8, + core.VarDesc.VarType.INT8, + core.VarDesc.VarType.INT16, + core.VarDesc.VarType.INT32, + core.VarDesc.VarType.INT64, + core.VarDesc.VarType.BOOL, +] + @dygraph_only def matmul(x, y, name=None): @@ -197,3 +206,191 @@ def mv(x, vec, name=None): """ return _C_ops.final_state_sparse_mv(x, vec) + + +def add(x, y, name=None): + """ + Add two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse + type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. + The equation is: + + .. math:: + out = x + y + + Args: + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: the result tensor. + + Examples: + + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + paddle.device.set_device("cpu") + + with _test_eager_guard(): + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.incubate.sparse.add(sparse_x, sparse_y) + print(sparse_z.to_dense()) + + # [[ 0., -1., 0., 0.], + # [ 0., 2., -6., 0.], + # [ 6., 8., 4., 8.]] + + """ + if y.dtype != x.dtype: + y = _C_ops.final_state_sparse_cast(y, None, x.dtype) + return _C_ops.final_state_sparse_add(x, y) + + +@dygraph_only +def subtract(x, y, name=None): + """ + Subtract two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse + type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. + The equation is: + + .. math:: + out = x - y + + Args: + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: the result tensor. + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + paddle.device.set_device("cpu") + + with _test_eager_guard(): + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.incubate.sparse.subtract(sparse_x, sparse_y) + print(sparse_z.to_dense()) + + # [[ 0., -1., 0., 4.], + # [ 0., -2., 0., 0.], + # [ 2., 2., -4., -8.]] + + """ + if y.dtype != x.dtype: + y = _C_ops.final_state_sparse_cast(y, None, x.dtype) + return _C_ops.final_state_sparse_subtract(x, y) + + +@dygraph_only +def multiply(x, y, name=None): + """ + Multiply two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse + type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. + The equation is: + + .. math:: + out = x * y + + Args: + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: the result tensor. + + Examples: + + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + paddle.device.set_device("cpu") + + with _test_eager_guard(): + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.incubate.sparse.multiply(sparse_x, sparse_y) + print(sparse_z.to_dense()) + + # [[ 0., 0., 0., -4.], + # [ 0., 0., 9., 0.], + # [ 8., 15., 0., 0.]] + + """ + if isinstance(y, (int, float)): + return _C_ops.final_state_sparse_scale(x, float(y), 0.0, True) + else: + if y.dtype != x.dtype: + y = _C_ops.final_state_sparse_cast(y, None, x.dtype) + return _C_ops.final_state_sparse_multiply(x, y) + + +@dygraph_only +def divide(x, y, name=None): + """ + Divide two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse + type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. + The equation is: + + .. math:: + out = x / y + + Args: + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: the result tensor. + + Examples: + + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + paddle.device.set_device("cpu") + + with _test_eager_guard(): + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.incubate.sparse.divide(sparse_x, sparse_y) + print(sparse_z.to_dense()) + + # [[ nan , -inf. , nan , -1. ], + # [ nan , 0. , 1. , nan ], + # [ 2. , 1.66666663, 0. , 0. 
]] + + """ + if x.dtype in _int_dtype_: + x = _C_ops.final_state_sparse_cast(x, None, core.VarDesc.VarType.FP32) + + if isinstance(y, (int, float)): + return _C_ops.final_state_sparse_divide_scalar(x, float(y)) + else: + if y.dtype != x.dtype: + y = _C_ops.final_state_sparse_cast(y, None, x.dtype) + return _C_ops.final_state_sparse_divide(x, y) diff --git a/python/paddle/incubate/sparse/math.py b/python/paddle/incubate/sparse/math.py deleted file mode 100644 index c6a984c3ad5be..0000000000000 --- a/python/paddle/incubate/sparse/math.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -sparse math functions -""" -from __future__ import print_function - -from paddle import _C_ops, in_dynamic_mode, device, int32, int64 -from paddle.tensor import cast -from paddle.incubate.sparse import sparse_csr_tensor - - -def _cast_coo(x, dtype, name=None): - indices = x.indices() - values = cast(x.values(), dtype) - return _C_ops.final_state_sparse_create_sparse_coo_tensor( - values, indices, x.shape) - - -def _cast_csr(x, dtype, name=None): - crows = x.crows() - cols = x.cols() - values = cast(x.values(), dtype) - return sparse_csr_tensor(crows, cols, values, x.shape) - - -def _cast(x, dtype, name=None): - if x.is_sparse_coo(): - return _cast_coo(x, dtype, name) - return _cast_csr(x, dtype, name) - - -def add(x, y, name=None): - """ - Add two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse - type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. - The equation is: - - .. math:: - out = x + y - - Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor: the result tensor. - - Examples: - - .. code-block:: python - - import paddle - from paddle.fluid.framework import _test_eager_guard - - paddle.device.set_device("cpu") - - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.add(sparse_x, sparse_y) - print(sparse_z.to_dense()) - - # [[ 0., -1., 0., 0.], - # [ 0., 2., -6., 0.], - # [ 6., 8., 4., 8.]] - - """ - assert device.get_device( - ) == "cpu", "Currently, Sparse add only support CPU device." 
- assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_csr() == y.is_sparse_csr( - ), f"Expect sparse tensor type to be same" - if x.is_sparse_coo() or x.is_sparse_csr(): - return _C_ops.final_state_sparse_add(x, y) - else: - raise ValueError( - "Currently, sparse.add only support the input of SparseCooTensor or SparseCsrTensor" - ) - - -def subtract(x, y, name=None): - """ - Subtract two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse - type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. - The equation is: - - .. math:: - out = x - y - - Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor: the result tensor. - - Examples: - - .. code-block:: python - - import paddle - from paddle.fluid.framework import _test_eager_guard - - paddle.device.set_device("cpu") - - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.subtract(sparse_x, sparse_y) - print(sparse_z.to_dense()) - - # [[ 0., -1., 0., 4.], - # [ 0., -2., 0., 0.], - # [ 2., 2., -4., -8.]] - - """ - assert device.get_device( - ) == "cpu", "Currently, Sparse subtract only support CPU device." - assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_csr() == y.is_sparse_csr( - ), f"Expect sparse tensor type to be same" - if x.is_sparse_coo() or x.is_sparse_csr(): - return _C_ops.final_state_sparse_subtract(x, y) - else: - raise ValueError( - "Currently, sparse.subtract only support the input of SparseCooTensor or SparseCsrTensor" - ) - - -def multiply(x, y, name=None): - """ - Multiply two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse - type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. - The equation is: - - .. math:: - out = x * y - - Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor: the result tensor. - - Examples: - - .. code-block:: python - - import paddle - from paddle.fluid.framework import _test_eager_guard - - paddle.device.set_device("cpu") - - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.multiply(sparse_x, sparse_y) - print(sparse_z.to_dense()) - - # [[ 0., 0., 0., -4.], - # [ 0., 0., 9., 0.], - # [ 8., 15., 0., 0.]] - - """ - assert device.get_device( - ) == "cpu", "Currently, Sparse multiply only support CPU device." 
- assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_csr() == y.is_sparse_csr( - ), f"Expect sparse tensor type to be same" - if x.is_sparse_coo() or x.is_sparse_csr(): - return _C_ops.final_state_sparse_multiply(x, y) - else: - raise ValueError( - "Currently, sparse.multiply only support the input of SparseCooTensor or SparseCsrTensor" - ) - - -def divide(x, y, name=None): - """ - Divide two sparse tensors element-wise. Input x and y's shape should be identical and have same sparse - type(SparseCooTensor or SparseCsrTensor).If input is SparseCooTensor, x and y's sparse_dim should be identical. - The equation is: - - .. math:: - out = x / y - - Args: - x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor: the result tensor. - - Examples: - - .. code-block:: python - - import paddle - from paddle.fluid.framework import _test_eager_guard - - paddle.device.set_device("cpu") - - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.divide(sparse_x, sparse_y) - print(sparse_z.to_dense()) - - # [[ nan , -inf. , nan , -1. ], - # [ nan , 0. , 1. , nan ], - # [ 2. , 1.66666663, 0. , 0. ]] - - """ - assert device.get_device( - ) == "cpu", "Currently, Sparse divide only support CPU device." - assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_csr() == y.is_sparse_csr( - ), f"Expect sparse tensor type to be same" - - if x.dtype in [int32, int64]: - if x.is_sparse_coo() or x.is_sparse_csr(): - cx = _cast(x, 'float32') - cy = _cast(y, 'float32') - return _C_ops.final_state_sparse_divide(cx, cy) - else: - raise ValueError( - "Currently, sparse.divide only support the input of SparseCooTensor or SparseCsrTensor" - ) - else: - if x.is_sparse_coo() or x.is_sparse_csr(): - return _C_ops.final_state_sparse_divide(x, y) - else: - raise ValueError( - "Currently, sparse.divide only support the input of SparseCooTensor or SparseCsrTensor" - ) diff --git a/python/paddle/incubate/sparse/unary.py b/python/paddle/incubate/sparse/unary.py index 09e449b0d9c5e..d3fb55b73757a 100644 --- a/python/paddle/incubate/sparse/unary.py +++ b/python/paddle/incubate/sparse/unary.py @@ -13,19 +13,79 @@ # limitations under the License. from paddle import _C_ops -from paddle.fluid.framework import dygraph_only +from paddle.fluid.framework import dygraph_only, core, convert_np_dtype_to_dtype_ __all__ = [] @dygraph_only -def tanh(x, name=None): +def sin(x, name=None): """ - sparse tanh activation, requiring x to be a sparse coo or sparse csr tensor. + Calculate elementwise sin of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = sin(x) + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.sin(sparse_x) + + """ + return _C_ops.final_state_sparse_sin(x) + + +@dygraph_only +def tan(x, name=None): + """ + Calculate elementwise tan of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + .. math:: - out = tanh(x) + out = tan(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.tan(sparse_x) + + """ + return _C_ops.final_state_sparse_tan(x) + + +@dygraph_only +def asin(x, name=None): + """ + Calculate elementwise asin of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = asin(x) Parameters: x (Tensor): The input Sparse Tensor with data type float32, float64. @@ -39,21 +99,200 @@ def tanh(x, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - with _test_eager_guard(): - dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.tanh(sparse_x) + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.asin(sparse_x) + + """ + return _C_ops.final_state_sparse_asin(x) + + +@dygraph_only +def atan(x, name=None): + """ + Calculate elementwise atan of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = atan(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.atan(sparse_x) + + """ + return _C_ops.final_state_sparse_atan(x) + + +@dygraph_only +def sinh(x, name=None): + """ + Calculate elementwise sinh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = sinh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.sinh(sparse_x) + + """ + return _C_ops.final_state_sparse_sinh(x) + + +@dygraph_only +def asinh(x, name=None): + """ + Calculate elementwise asinh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = asinh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). 
+ For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.asinh(sparse_x) + + """ + return _C_ops.final_state_sparse_asinh(x) + + +@dygraph_only +def atanh(x, name=None): + """ + Calculate elementwise atanh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = atanh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.atanh(sparse_x) + + """ + return _C_ops.final_state_sparse_atanh(x) + + +@dygraph_only +def tanh(x, name=None): + """ + Calculate elementwise tanh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = tanh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.tanh(sparse_x) + """ return _C_ops.final_state_sparse_tanh(x) @dygraph_only -def sqrt(x, name=None): +def square(x, name=None): """ - Calculate square root of x, requiring x to be a sparse coo or sparse csr tensor. + Calculate elementwise square of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = square(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.square(sparse_x) + + """ + return _C_ops.final_state_sparse_square(x) + + +@dygraph_only +def sqrt(x, name=None): + """ + Calculate elementwise sqrt of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. + .. math:: out = sqrt(x) @@ -70,24 +309,23 @@ def sqrt(x, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - with _test_eager_guard(): - dense_x = paddle.to_tensor([4, 0, 1], dtype='float32') - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.sqrt(sparse_x) + dense_x = paddle.to_tensor([-2., 0., 1.]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.sqrt(sparse_x) + """ return _C_ops.final_state_sparse_sqrt(x) @dygraph_only -def sin(x, name=None): +def log1p(x, name=None): """ - Calculate sin of x, requiring x to be a sparse coo or sparse csr tensor. 
+ Calculate the natural log of (1+x), requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: - out = sin(x) + out = ln(1+x) Parameters: x (Tensor): The input Sparse Tensor with data type float32, float64. @@ -101,11 +339,136 @@ def sin(x, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - with _test_eager_guard(): - dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.sin(sparse_x) + dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.log1p(sparse_x) + """ - return _C_ops.final_state_sparse_sin(x) + return _C_ops.final_state_sparse_log1p(x) + + +@dygraph_only +def cast(x, index_dtype=None, value_dtype=None, name=None): + """ + cast non-zero-index of SparseTensor to `index_dtype`, non-zero-element of SparseTensor to + `value_dtype` , requiring x to be a SparseCooTensor or SparseCsrTensor. + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + index_dtype (np.dtype|str, optional): Data type of the index of SparseCooTensor, + or crows/cols of SparseCsrTensor. Can be uint8, int8, int16, int32, int64. + value_dtype (np.dtype|str, optional): Data type of the value of SparseCooTensor, + SparseCsrTensor. Can be bool, float16, float32, float64, int8, int32, int64, uint8. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2, 0, 1]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.cast(sparse_x, 'int32', 'float64') + + """ + if index_dtype and not isinstance(index_dtype, core.VarDesc.VarType): + index_dtype = convert_np_dtype_to_dtype_(index_dtype) + if value_dtype and not isinstance(value_dtype, core.VarDesc.VarType): + value_dtype = convert_np_dtype_to_dtype_(value_dtype) + return _C_ops.final_state_sparse_cast(x, index_dtype, value_dtype) + + +@dygraph_only +def pow(x, factor, name=None): + """ + Calculate elementwise pow of x, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = x^{factor} + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + factor (float|int): factor of pow. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.pow(sparse_x, 2) + + """ + return _C_ops.final_state_sparse_pow(x, float(factor)) + + +@dygraph_only +def neg(x, name=None): + """ + Calculate elementwise negative of x, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = -x + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.neg(sparse_x) + + """ + return _C_ops.final_state_sparse_scale(x, -1.0, 0.0, True) + + +@dygraph_only +def abs(x, name=None): + """ + Calculate elementwise absolute value of x, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. math:: + + out = |x| + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.incubate.sparse.abs(sparse_x) + + """ + return _C_ops.final_state_sparse_abs(x) From b0c9f24a3e0e71fa9106f0976fb43567913e5fee Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 12 Jul 2022 01:43:37 -0500 Subject: [PATCH 142/250] [PHI] Clean glog header in public header (#44216) * clean glog header in public header * move marco pos --- paddle/phi/backends/device_manager.cc | 2 ++ paddle/phi/backends/dynload/port.cc | 2 ++ paddle/phi/backends/dynload/port.h | 1 - paddle/phi/backends/gpu/gpu_info.cc | 1 + paddle/phi/backends/gpu/gpu_launch_config.h | 1 + paddle/phi/core/meta_tensor.cc | 2 ++ paddle/phi/core/meta_tensor.h | 1 - paddle/phi/infermeta/binary.cc | 2 ++ paddle/phi/infermeta/ternary.cc | 2 ++ paddle/phi/kernels/funcs/data_type_transform.h | 2 ++ paddle/phi/kernels/gpu/allclose_kernel.cu | 2 ++ paddle/phi/kernels/impl/einsum_impl.h | 2 ++ paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc | 2 ++ 13 files changed, 20 insertions(+), 2 deletions(-) diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index ffaf42a0cf4e6..5b1022794a5c3 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -24,6 +24,8 @@ #include #include +#include "glog/logging.h" + namespace phi { void Device::CreateStream(stream::Stream* stream, diff --git a/paddle/phi/backends/dynload/port.cc b/paddle/phi/backends/dynload/port.cc index 5988417654890..d1b3da64c8570 100644 --- a/paddle/phi/backends/dynload/port.cc +++ b/paddle/phi/backends/dynload/port.cc @@ -18,6 +18,8 @@ #include #include +#include "glog/logging.h" + #if !defined(_WIN32) #include // dladdr #include diff --git a/paddle/phi/backends/dynload/port.h b/paddle/phi/backends/dynload/port.h index ed48553accb74..03a2863e4dc4e 100644 --- a/paddle/phi/backends/dynload/port.h +++ b/paddle/phi/backends/dynload/port.h @@ -17,7 +17,6 @@ #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" #if !defined(_WIN32) #include // dladdr diff --git a/paddle/phi/backends/gpu/gpu_info.cc b/paddle/phi/backends/gpu/gpu_info.cc index cc655f1822998..a2399554ba853 100644 --- a/paddle/phi/backends/gpu/gpu_info.cc +++ b/paddle/phi/backends/gpu/gpu_info.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_info.h" +#include #include #include "gflags/gflags.h" diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 04b2786c4d0fb..552f60783c8b2 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -30,6 +30,7 @@ #include #include +#include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 2178855aa0fee..f0cd841235ef1 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/selected_rows.h" diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 271759161868b..377d0e9bc4d6d 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include "glog/logging.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 1ba025e2c6252..269286d76d954 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -17,6 +17,8 @@ limitations under the License. */ #include #include +#include "glog/logging.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index a22f720b97e76..9f65de0f0aa70 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/infermeta/ternary.h" +#include "glog/logging.h" + #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" diff --git a/paddle/phi/kernels/funcs/data_type_transform.h b/paddle/phi/kernels/funcs/data_type_transform.h index ad7f2aa192ce4..72fa94d4ed23d 100644 --- a/paddle/phi/kernels/funcs/data_type_transform.h +++ b/paddle/phi/kernels/funcs/data_type_transform.h @@ -14,6 +14,8 @@ limitations under the License. 
*/ #pragma once +#include "glog/logging.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/cast_kernel.h" diff --git a/paddle/phi/kernels/gpu/allclose_kernel.cu b/paddle/phi/kernels/gpu/allclose_kernel.cu index 8abc6b272c511..fa6a8fce0bf86 100644 --- a/paddle/phi/kernels/gpu/allclose_kernel.cu +++ b/paddle/phi/kernels/gpu/allclose_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/allclose_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 43b2760b404f9..b5bc826881af8 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -15,6 +15,8 @@ #include +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/matmul_kernel.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc index d9ebbd10267f5..972b4537b9554 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" From 636c6347c61cf338db0b9f40a3e65bd9998bcfed Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Tue, 12 Jul 2022 15:03:02 +0800 Subject: [PATCH 143/250] fix_convfusion (#44226) --- paddle/fluid/framework/ir/graph_pattern_detector.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index e811475dd83e9..b0792ee0812c9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2243,8 +2243,12 @@ PDNode *patterns::PriorBox::operator()() { return boxes_var; } +#if CUDNN_VERSION >= 8000 std::unordered_set conv_act_set( {"identity", "relu", "sigmoid", "tanh"}); +#else +std::unordered_set conv_act_set({"identity", "relu"}); +#endif PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { conv_in->AsInput(); From d55ee95fceba4373ee05a36d67ea23df872475a3 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Tue, 12 Jul 2022 15:16:32 +0800 Subject: [PATCH 144/250] [Phi] Migrate merged_adam_op into Phi (#44184) * remov merged_adam_op to phi * refine code --- .../operators/optimizers/merged_adam_op.cc | 30 ++- .../operators/optimizers/merged_adam_op.cu | 230 ------------------ .../operators/optimizers/merged_adam_op.h | 124 ---------- paddle/phi/infermeta/multiary.cc | 21 ++ paddle/phi/infermeta/multiary.h | 21 ++ paddle/phi/kernels/adam_kernel.h | 23 ++ paddle/phi/kernels/cpu/adam_kernel.cc | 104 ++++++++ paddle/phi/kernels/gpu/adam_kernel.cu | 112 +++++++++ paddle/phi/ops/compat/merged_adam_sig.cc | 47 ++++ 9 files changed, 345 insertions(+), 367 deletions(-) delete mode 100644 paddle/fluid/operators/optimizers/merged_adam_op.cu delete mode 100644 paddle/fluid/operators/optimizers/merged_adam_op.h 
create mode 100644 paddle/phi/ops/compat/merged_adam_sig.cc diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cc b/paddle/fluid/operators/optimizers/merged_adam_op.cc index 69ca8ec3c6670..f49fc72d01030 100644 --- a/paddle/fluid/operators/optimizers/merged_adam_op.cc +++ b/paddle/fluid/operators/optimizers/merged_adam_op.cc @@ -10,7 +10,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/merged_adam_op.h" +#include "paddle/fluid/framework/op_registry.h" + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,8 +25,6 @@ class MergedAdamOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override {} - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto param_dtype = @@ -128,13 +130,15 @@ param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsil } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(merged_adam, - ops::MergedAdamOp, - ops::MergedAdamOpMaker); -REGISTER_OP_WITHOUT_GRADIENT(merged_adamw, - ops::MergedAdamOp, - ops::MergedAdamOpMaker); - -REGISTER_OP_CPU_KERNEL(merged_adam, - ops::MergedAdamOpKernel, - ops::MergedAdamOpKernel); + +DECLARE_INFER_SHAPE_FUNCTOR(merged_adam, + MergedAdamInferMetaFunctor, + PD_INFER_META(phi::MergedAdamInferMeta)); + +REGISTER_OPERATOR( + merged_adam, + ops::MergedAdamOp, + ops::MergedAdamOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + MergedAdamInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.cu b/paddle/fluid/operators/optimizers/merged_adam_op.cu deleted file mode 100644 index 578c9864fa42d..0000000000000 --- a/paddle/fluid/operators/optimizers/merged_adam_op.cu +++ /dev/null @@ -1,230 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/optimizers/merged_adam_op.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" - -namespace paddle { -namespace operators { - -template -__global__ void AdamKernelREG(MT beta1, - MT beta2, - MT epsilon, - MT beta1_pow_, - MT beta2_pow_, - const MT* moment1, - MT* moment1_out, - const MT* moment2, - MT* moment2_out, - const MT* lr_, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - int ndim) { - MT lr = *lr_; - MT beta1_pow = beta1_pow_; - MT beta2_pow = beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? 
master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void AdamKernelMEM(MT beta1, - MT beta2, - MT epsilon, - const MT* beta1_pow_, - const MT* beta2_pow_, - const MT* moment1, - MT* moment1_out, - const MT* moment2, - MT* moment2_out, - const MT* lr_, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - int ndim) { - MT lr = *lr_; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void UpdateBetaPow(T beta1, - T beta2, - const T* beta1_pow_, - const T* beta2_pow_, - T* beta1_pow_out, - T* beta2_pow_out) { - *beta1_pow_out = beta1 * beta1_pow_[0]; - *beta2_pow_out = beta2 * beta2_pow_[0]; -} - -template -class MergedAdamOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using MPDType = typename details::MPTypeTrait::Type; - - auto param = ctx.MultiInput("Param"); - auto grad = ctx.MultiInput("Grad"); - auto lr = ctx.MultiInput("LearningRate"); - auto mom1 = ctx.MultiInput("Moment1"); - auto mom2 = ctx.MultiInput("Moment2"); - auto beta1_pow = ctx.MultiInput("Beta1Pow"); - auto beta2_pow = ctx.MultiInput("Beta2Pow"); - - auto param_out = ctx.MultiOutput("ParamOut"); - auto mom1_out = ctx.MultiOutput("Moment1Out"); - auto mom2_out = ctx.MultiOutput("Moment2Out"); - auto beta1_pow_out = ctx.MultiOutput("Beta1PowOut"); - auto beta2_pow_out = ctx.MultiOutput("Beta2PowOut"); - - MPDType beta1 = static_cast(ctx.Attr("beta1")); - MPDType beta2 = static_cast(ctx.Attr("beta2")); - MPDType epsilon = static_cast(ctx.Attr("epsilon")); - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - const bool multi_precision = ctx.Attr("multi_precision"); - auto master_param = ctx.MultiInput("MasterParam"); - auto master_param_out = - ctx.MultiOutput("MasterParamOut"); - - auto& dev_ctx = ctx.template device_context(); - - size_t param_num = param.size(); - for (size_t idx = 0; idx < param_num; idx++) { - const MPDType* master_in_data = - multi_precision ? master_param[idx]->data() : nullptr; - MPDType* master_out_data = - multi_precision - ? 
master_param_out[idx]->mutable_data(ctx.GetPlace()) - : nullptr; - - // update param and moment - int threads = 512; - int blocks = (param[idx]->numel() + threads - 1) / threads; - - if (beta1_pow[idx]->place() == platform::CPUPlace() && - beta2_pow[idx]->place() == platform::CPUPlace()) { - // Compute with betapow in REG - AdamKernelREG<<>>( - beta1, - beta2, - epsilon, - *beta1_pow[idx]->data(), - *beta2_pow[idx]->data(), - mom1[idx]->data(), - mom1_out[idx]->mutable_data(ctx.GetPlace()), - mom2[idx]->data(), - mom2_out[idx]->mutable_data(ctx.GetPlace()), - lr[idx]->data(), - grad[idx]->data(), - param[idx]->data(), - param_out[idx]->mutable_data(ctx.GetPlace()), - master_in_data, - master_out_data, - param[idx]->numel()); - if (!use_global_beta_pow) { - // Cpu update - beta1_pow_out[idx]->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow[idx]->data()[0]; - beta2_pow_out[idx]->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow[idx]->data()[0]; - } - } else { - AdamKernelMEM<<>>( - beta1, - beta2, - epsilon, - beta1_pow[idx]->data(), - beta2_pow[idx]->data(), - mom1[idx]->data(), - mom1_out[idx]->mutable_data(ctx.GetPlace()), - mom2[idx]->data(), - mom2_out[idx]->mutable_data(ctx.GetPlace()), - lr[idx]->data(), - grad[idx]->data(), - param[idx]->data(), - param_out[idx]->mutable_data(ctx.GetPlace()), - master_in_data, - master_out_data, - param[idx]->numel()); - if (!use_global_beta_pow) { - // Update with gpu - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, - beta2, - beta1_pow[idx]->data(), - beta2_pow[idx]->data(), - beta1_pow_out[idx]->mutable_data(ctx.GetPlace()), - beta2_pow_out[idx]->mutable_data(ctx.GetPlace())); - } - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(merged_adam, - ops::MergedAdamOpCUDAKernel, - ops::MergedAdamOpCUDAKernel, - ops::MergedAdamOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.h b/paddle/fluid/operators/optimizers/merged_adam_op.h deleted file mode 100644 index 3b7c8ab0286c3..0000000000000 --- a/paddle/fluid/operators/optimizers/merged_adam_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/phi/kernels/funcs/adam_functors.h" - -namespace paddle { -namespace operators { - -namespace scatter = paddle::operators::math::scatter; - -template -class MergedAdamOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto param = ctx.MultiInput("Param"); - size_t n = param.size(); - auto grad = ctx.MultiInput("Grad"); - PADDLE_ENFORCE_EQ(n, - grad.size(), - platform::errors::InvalidArgument( - "The size of Input(Grad) must be equal to " - "Input(Param), but got the size of Input(Grad) " - "is %d, the size of Input(Param) is %d.", - grad.size(), - n)); - auto lr = ctx.MultiInput("LearningRate"); - PADDLE_ENFORCE_EQ( - n, - lr.size(), - platform::errors::InvalidArgument( - "The size of Input(LearningRate) must be equal to " - "Input(Param), but got the size of Input(LearningRate) " - "is %d, the size of Input(Param) is %d.", - lr.size(), - n)); - auto mom1 = ctx.MultiInput("Moment1"); - PADDLE_ENFORCE_EQ(n, - mom1.size(), - platform::errors::InvalidArgument( - "The size of Input(Moment1) must be equal to " - "Input(Param), but got the size of Input(Moment1) " - "is %d, the size of Input(Param) is %d.", - mom1.size(), - n)); - auto mom2 = ctx.MultiInput("Moment2"); - PADDLE_ENFORCE_EQ(n, - mom2.size(), - platform::errors::InvalidArgument( - "The size of Input(Moment2) must be equal to " - "Input(Param), but got the size of Input(Moment2) " - "is %d, the size of Input(Param) is %d.", - mom2.size(), - n)); - auto beta1_pow = ctx.MultiInput("Beta1Pow"); - PADDLE_ENFORCE_EQ(n, - beta1_pow.size(), - platform::errors::InvalidArgument( - "The size of Input(Beta1Pow) must be equal to " - "Input(Param), but got the size of Input(Beta1Pow) " - "is %d, the size of Input(Param) is %d.", - beta1_pow.size(), - n)); - auto beta2_pow = ctx.MultiInput("Beta2Pow"); - PADDLE_ENFORCE_EQ(n, - beta2_pow.size(), - platform::errors::InvalidArgument( - "The size of Input(Beta2Pow) must be equal to " - "Input(Param), but got the size of Input(Beta2Pow) " - "is %d, the size of Input(Param) is %d.", - beta2_pow.size(), - n)); - - auto param_out = ctx.MultiOutput("ParamOut"); - auto mom1_out = ctx.MultiOutput("Moment1Out"); - auto mom2_out = ctx.MultiOutput("Moment2Out"); - auto beta1_pow_out = ctx.MultiOutput("Beta1PowOut"); - auto beta2_pow_out = ctx.MultiOutput("Beta2PowOut"); - - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - T epsilon = static_cast(ctx.Attr("epsilon")); - bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - size_t param_num = param.size(); - for (size_t idx = 0; idx < param_num; idx++) { - phi::funcs::AdamFunctor functor( - beta1, - beta2, - epsilon, - beta1_pow[idx]->data(), - beta2_pow[idx]->data(), - mom1[idx]->data(), - mom1_out[idx]->mutable_data(ctx.GetPlace()), - mom2[idx]->data(), - mom2_out[idx]->mutable_data(ctx.GetPlace()), - lr[idx]->data(), - grad[idx]->data(), - param[idx]->data(), - param_out[idx]->mutable_data(ctx.GetPlace())); - functor(param[idx]->numel()); - if (!use_global_beta_pow) { - beta1_pow_out[idx]->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow[idx]->data()[0]; - beta2_pow_out[idx]->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow[idx]->data()[0]; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git 
a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 61c57981f94b5..575e60923cd21 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1528,6 +1528,27 @@ void LogspaceInferMeta(const MetaTensor& start, out->set_dtype(start.dtype()); } +void MergedAdamInferMeta( + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out) {} + void MeshgridInferMeta(const std::vector& inputs, std::vector outputs) { const size_t inputs_num = inputs.size(); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 54c6fccceb9c1..c0972816f3ba2 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -234,6 +234,27 @@ void LogspaceInferMeta(const MetaTensor& start, const MetaTensor& base, MetaTensor* out); +void MergedAdamInferMeta( + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out); + void MeshgridInferMeta(const std::vector& inputs, std::vector outputs); diff --git a/paddle/phi/kernels/adam_kernel.h b/paddle/phi/kernels/adam_kernel.h index 0bdf05f8e5123..b1a7f5a686530 100644 --- a/paddle/phi/kernels/adam_kernel.h +++ b/paddle/phi/kernels/adam_kernel.h @@ -44,4 +44,27 @@ void AdamDenseKernel(const Context& dev_ctx, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs); +template +void MergedAdamKernel( + const Context& dev_ctx, + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc index 03e2a539640ea..03a75bd36156f 100644 --- a/paddle/phi/kernels/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/cpu/adam_kernel.cc @@ -167,7 +167,111 @@ void AdamDenseKernel(const Context& dev_ctx, } } +template +void MergedAdamKernel( + const Context& dev_ctx, + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& 
beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out) { + size_t param_num = param.size(); + PADDLE_ENFORCE_EQ( + param_num, + grad.size(), + errors::InvalidArgument("The size of Input(grad) must be equal to " + "Input(param), but got the size of Input(grad) " + "is %d, the size of Input(param) is %d.", + grad.size(), + param_num)); + PADDLE_ENFORCE_EQ( + param_num, + learning_rate.size(), + errors::InvalidArgument( + "The size of Input(learning_rate) must be equal to " + "Input(param), but got the size of Input(learning_rate) " + "is %d, the size of Input(param) is %d.", + learning_rate.size(), + param_num)); + PADDLE_ENFORCE_EQ(param_num, + moment1.size(), + errors::InvalidArgument( + "The size of Input(moment1) must be equal to " + "Input(param), but got the size of Input(moment1) " + "is %d, the size of Input(param) is %d.", + moment1.size(), + param_num)); + PADDLE_ENFORCE_EQ(param_num, + moment2.size(), + errors::InvalidArgument( + "The size of Input(moment2) must be equal to " + "Input(param), but got the size of Input(moment2) " + "is %d, the size of Input(param) is %d.", + moment2.size(), + param_num)); + PADDLE_ENFORCE_EQ(param_num, + beta1_pow.size(), + errors::InvalidArgument( + "The size of Input(beta1_pow) must be equal to " + "Input(param), but got the size of Input(beta1_pow) " + "is %d, the size of Input(param) is %d.", + beta1_pow.size(), + param_num)); + PADDLE_ENFORCE_EQ(param_num, + beta2_pow.size(), + errors::InvalidArgument( + "The size of Input(beta2_pow) must be equal to " + "Input(param), but got the size of Input(beta2_pow) " + "is %d, the size of Input(param) is %d.", + beta2_pow.size(), + param_num)); + T beta1_ = beta1.to(); + T beta2_ = beta2.to(); + T epsilon_ = epsilon.to(); + + for (size_t idx = 0; idx < param_num; idx++) { + phi::funcs::AdamFunctor functor( + beta1_, + beta2_, + epsilon_, + beta1_pow[idx]->data(), + beta2_pow[idx]->data(), + moment1[idx]->data(), + dev_ctx.template Alloc(moment1_out[idx]), + moment2[idx]->data(), + dev_ctx.template Alloc(moment2_out[idx]), + learning_rate[idx]->data(), + grad[idx]->data(), + param[idx]->data(), + dev_ctx.template Alloc(param_out[idx])); + functor(param[idx]->numel()); + if (!use_global_beta_pow) { + dev_ctx.template Alloc(beta1_pow_out[idx])[0] = + beta1_ * beta1_pow[idx]->data()[0]; + dev_ctx.template Alloc(beta2_pow_out[idx])[0] = + beta2_ * beta2_pow[idx]->data()[0]; + } + } +} + } // namespace phi PD_REGISTER_KERNEL(adam, CPU, ALL_LAYOUT, phi::AdamDenseKernel, float, double) { } + +PD_REGISTER_KERNEL( + merged_adam, CPU, ALL_LAYOUT, phi::MergedAdamKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 59aa4cf597e86..b20e8610fefaf 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -265,6 +265,106 @@ void AdamDenseKernel(const Context& dev_ctx, } } +template +void MergedAdamKernel( + const Context& dev_ctx, + const std::vector& param, + const std::vector& grad, + const std::vector& learning_rate, + const std::vector& moment1, + const std::vector& moment2, + const std::vector& beta1_pow, + const std::vector& beta2_pow, + const paddle::optional>& master_param, + const Scalar& beta1, + const Scalar& beta2, + const Scalar& epsilon, + bool multi_precision, + bool use_global_beta_pow, + std::vector 
param_out, + std::vector moment1_out, + std::vector moment2_out, + std::vector beta1_pow_out, + std::vector beta2_pow_out, + std::vector master_param_out) { + using MPDType = typename phi::dtype::MPTypeTrait::Type; + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + MPDType beta1_ = beta1.to(); + MPDType beta2_ = beta2.to(); + MPDType epsilon_ = epsilon.to(); + + size_t param_num = param.size(); + + for (size_t idx = 0; idx < param_num; idx++) { + const MPDType* master_in_data = + multi_precision ? master_param.get()[idx]->data() : nullptr; + MPDType* master_out_data = + multi_precision ? dev_ctx.template Alloc(master_param_out[idx]) + : nullptr; + + // update param and moment + int threads = 512; + int blocks = (param[idx]->numel() + threads - 1) / threads; + + if (beta1_pow[idx]->place() == CPUPlace() && + beta2_pow[idx]->place() == CPUPlace()) { + // Compute with betapow in REG + AdamKernelREG<<>>( + beta1_, + beta2_, + epsilon_, + *beta1_pow[idx]->data(), + *beta2_pow[idx]->data(), + moment1[idx]->data(), + dev_ctx.template Alloc(moment1_out[idx]), + moment2[idx]->data(), + dev_ctx.template Alloc(moment2_out[idx]), + learning_rate[idx]->data(), + grad[idx]->data(), + param[idx]->data(), + dev_ctx.template Alloc(param_out[idx]), + master_in_data, + master_out_data, + param[idx]->numel()); + if (!use_global_beta_pow) { + // Cpu update + dev_ctx.template HostAlloc(beta1_pow_out[idx])[0] = + beta1_ * beta1_pow[idx]->data()[0]; + dev_ctx.template HostAlloc(beta2_pow_out[idx])[0] = + beta2_ * beta2_pow[idx]->data()[0]; + } + } else { + AdamKernelMEM<<>>( + beta1_, + beta2_, + epsilon_, + beta1_pow[idx]->data(), + beta2_pow[idx]->data(), + moment1[idx]->data(), + dev_ctx.template Alloc(moment1_out[idx]), + moment2[idx]->data(), + dev_ctx.template Alloc(moment2_out[idx]), + learning_rate[idx]->data(), + grad[idx]->data(), + param[idx]->data(), + dev_ctx.template Alloc(param_out[idx]), + master_in_data, + master_out_data, + param[idx]->numel()); + if (!use_global_beta_pow) { + // Update with gpu + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1_, + beta2_, + beta1_pow[idx]->data(), + beta2_pow[idx]->data(), + dev_ctx.template Alloc(beta1_pow_out[idx]), + dev_ctx.template Alloc(beta2_pow_out[idx])); + } + } + } +} + } // namespace phi PD_REGISTER_KERNEL(adam, @@ -279,3 +379,15 @@ PD_REGISTER_KERNEL(adam, kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(8).SetBackend(phi::Backend::ALL_BACKEND); } + +PD_REGISTER_KERNEL(merged_adam, + GPU, + ALL_LAYOUT, + phi::MergedAdamKernel, + float, + double, + phi::dtype::float16) { + // Skip beta1_pow, beta2_pow data transform + kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/ops/compat/merged_adam_sig.cc b/paddle/phi/ops/compat/merged_adam_sig.cc new file mode 100644 index 0000000000000..38f56bad08d85 --- /dev/null +++ b/paddle/phi/ops/compat/merged_adam_sig.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
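The merged_adam kernels registered above loop over the parameter list and apply one independent Adam step per tensor, updating the beta-power accumulators on the host or on the device depending on where they live. A minimal NumPy sketch of what each iteration of that loop computes, assuming the conventional Adam formulation (the exact placement of epsilon inside AdamFunctor may differ slightly):

    import numpy as np

    def merged_adam_step(params, grads, moms1, moms2, beta1_pows, beta2_pows,
                         lrs, beta1=0.9, beta2=0.999, eps=1e-8,
                         use_global_beta_pow=False):
        # One fused op covers the whole list; every tensor still gets its own
        # Adam update, mirroring the per-index loop in the CPU/GPU kernels.
        for i in range(len(params)):
            moms1[i] = beta1 * moms1[i] + (1 - beta1) * grads[i]
            moms2[i] = beta2 * moms2[i] + (1 - beta2) * grads[i] ** 2
            lr_t = lrs[i] * np.sqrt(1 - beta2_pows[i]) / (1 - beta1_pows[i])
            params[i] = params[i] - lr_t * moms1[i] / (np.sqrt(moms2[i]) + eps)
            if not use_global_beta_pow:
                # Mirrors the beta1_pow_out / beta2_pow_out updates above.
                beta1_pows[i] = beta1_pows[i] * beta1
                beta2_pows[i] = beta2_pows[i] * beta2
        return params, moms1, moms2, beta1_pows, beta2_pows

Merging matters because the optimizer can launch a single op for the whole parameter list instead of one adam op per parameter; the shapes and dtypes stay per-tensor, which is consistent with the empty MergedAdamInferMeta body above.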
+// See the License for the specific language governing permissions and +// limitations under the License. +#include + +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +KernelSignature MergedAdamOpArgumentMapping(const ArgumentMappingContext& ctx) { + paddle::small_vector in_names = {"Param", + "Grad", + "LearningRate", + "Moment1", + "Moment2", + "Beta1Pow", + "Beta2Pow", + "MasterParam"}; + paddle::small_vector out_names = {"ParamOut", + "Moment1Out", + "Moment2Out", + "Beta1PowOut", + "Beta2PowOut", + "MasterParamOut"}; + paddle::small_vector attr_names = { + "beta1", "beta2", "epsilon", "multi_precision", "use_global_beta_pow"}; + + return KernelSignature("merged_adam", + std::move(in_names), + std::move(attr_names), + std::move(out_names)); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(merged_adam, phi::MergedAdamOpArgumentMapping); From 5061d3dbf771458d5cc3d998b5deb95da61a171c Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Tue, 12 Jul 2022 15:49:30 +0800 Subject: [PATCH 145/250] [MLU] fix sync copy bugs (#44127) --- paddle/fluid/operators/mlu/mlu_baseop.cc | 15 ++------ paddle/fluid/operators/mlu/mlu_baseop.h | 4 +-- paddle/fluid/operators/randperm_op_mlu.cc | 27 ++++++++++++-- paddle/fluid/operators/where_index_op_mlu.cc | 38 +++++++++++--------- 4 files changed, 52 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 175fa9f94470f..95a365f459f18 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -4274,21 +4274,12 @@ MLURNNDesc::~MLURNNDesc() { /* static */ void MLUCnnl::NumTrue(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, - Tensor index, - uint32_t* num_true) { + const cnnlTensorDescriptor_t num_true_desc, + void* num_true) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - size_t workspace_size = 0; PADDLE_ENFORCE_MLU_SUCCESS( - cnnlGetNumTrueWorkspaceSize(handle, x_desc, &workspace_size)); - - auto& dev_ctx = GetDevCtxFromCTX(ctx); - index = ctx.AllocateTmpTensor( - {static_cast(workspace_size)}, dev_ctx); - void* index_ptr = index.mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_MLU_SUCCESS(cnnlNumTrue( - handle, x_desc, x, static_cast(index_ptr), num_true)); + cnnlNumTrue_v2(handle, x_desc, x, num_true_desc, num_true)); } /* static */ void MLUCnnl::Where(const ExecutionContext& ctx, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 0d4c7d2e5a329..72446f56a18dc 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1703,8 +1703,8 @@ class MLUCnnl { static void NumTrue(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, const void* x, - Tensor index, - uint32_t* num_true); + const cnnlTensorDescriptor_t num_true_desc, + void* num_true); static void Where(const ExecutionContext& ctx, const cnnlTensorDescriptor_t x_desc, diff --git a/paddle/fluid/operators/randperm_op_mlu.cc b/paddle/fluid/operators/randperm_op_mlu.cc index 0d4fbf2d12f7c..a3ebf8f5c00fc 100644 --- a/paddle/fluid/operators/randperm_op_mlu.cc +++ b/paddle/fluid/operators/randperm_op_mlu.cc @@ -15,9 +15,32 @@ limitations under the License. 
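This MLU patch replaces host-pointer outputs with device tensors that are copied back explicitly: NumTrue above now writes the count of true elements into the tensor described by num_true_desc, and the where_index kernel further down copies that count to the CPU with TensorCopySync before sizing its output. For reference, the result those steps build up matches NumPy's nonzero semantics (an illustrative sketch, not the MLU API):

    import numpy as np

    def where_index_reference(condition):
        # The first output dim is the number of true elements, which is why
        # the kernel must know num_true on the host before allocating Out.
        true_num = int(np.count_nonzero(condition))
        # Coordinates of every true element, one row per hit: [true_num, rank].
        out = np.stack(np.nonzero(condition), axis=-1).astype(np.int64)
        assert out.shape == (true_num, condition.ndim)
        return out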
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/randperm_op.h" +namespace paddle { +namespace operators { + +template +class RandpermMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int n = ctx.Attr("n"); + unsigned int seed = static_cast(ctx.Attr("seed")); + framework::Variable* out_var = ctx.OutputVar("Out"); + framework::Tensor* out_tensor = + framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); + + framework::Tensor tmp_tensor; + tmp_tensor.Resize(phi::make_ddim({n})); + T* tmp_data = tmp_tensor.mutable_data(platform::CPUPlace()); + random_permate(tmp_data, n, seed); + framework::TensorCopySync(tmp_tensor, ctx.GetPlace(), out_tensor); + } +}; + +} // namespace operators +} // namespace paddle + template -using kernel = - paddle::operators::RandpermKernel; +using kernel = paddle::operators::RandpermMLUKernel; REGISTER_OP_MLU_KERNEL( randperm, kernel, kernel, kernel, kernel); diff --git a/paddle/fluid/operators/where_index_op_mlu.cc b/paddle/fluid/operators/where_index_op_mlu.cc index d0699521aa46e..389f7960bcdc1 100644 --- a/paddle/fluid/operators/where_index_op_mlu.cc +++ b/paddle/fluid/operators/where_index_op_mlu.cc @@ -30,30 +30,36 @@ class MLUWhereIndexKernel : public framework::OpKernel { auto* out = context.Output("Out"); auto dims = condition->dims(); const int rank = dims.size(); - std::vector true_num = {0}; - std::vector vec_condition; - paddle::framework::TensorToVector( - *condition, context.device_context(), &vec_condition); - int vec_con_size = vec_condition.size(); - for (int i = 0; i < vec_con_size; ++i) { - if (vec_condition[i] > 0) true_num[0]++; - } - out->Resize(phi::make_ddim({true_num[0], rank})); + Tensor num_true; + num_true.mutable_data({1}, context.GetPlace()); + MLUCnnlTensorDesc con_desc(*condition); + MLUCnnlTensorDesc num_true_desc(num_true); + MLUCnnl::NumTrue(context, + con_desc.get(), + GetBasePtr(condition), + num_true_desc.get(), + GetBasePtr(&num_true)); + + Tensor local_true_num; + paddle::framework::TensorCopySync( + num_true, platform::CPUPlace(), &local_true_num); + auto true_num = *local_true_num.data(); + + out->Resize(phi::make_ddim({true_num, rank})); out->mutable_data(context.GetPlace()); + + if (true_num == 0) { + return; + } + auto& dev_ctx = context.template device_context(); framework::Tensor out_int32 = context.AllocateTmpTensor(out->dims(), dev_ctx); - Tensor num_true; - paddle::framework::TensorFromVector( - true_num, context.device_context(), &num_true); - num_true.mutable_data(context.GetPlace()); - bool as_tuple = false; - MLUCnnlTensorDesc con_desc(*condition); - MLUCnnlTensorDesc num_true_desc(num_true); MLUCnnlTensorDesc out_int32_desc(out_int32); MLUCnnlTensorDesc out_desc(*out); + bool as_tuple = false; MLUCnnl::Where(context, con_desc.get(), GetBasePtr(condition), From 40b686308676c0c05961971bb47b345543f10234 Mon Sep 17 00:00:00 2001 From: fuyou765 <64373205+fuyou765@users.noreply.github.com> Date: Tue, 12 Jul 2022 15:55:07 +0800 Subject: [PATCH 146/250] [MLU] fix expand_v2 scatter tile where ctest bugs (#44220) --- .../paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py | 3 ++- python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py | 3 ++- python/paddle/fluid/tests/unittests/mlu/test_tile_op_mlu.py | 3 ++- python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git 
a/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py index d7b1768d50970..cbc99c2fa6686 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_expand_v2_op_mlu.py @@ -25,6 +25,8 @@ import paddle from paddle.fluid.framework import _test_eager_guard +paddle.enable_static() + # Situation 1: shape is a list(without tensor) class TestExpandV2OpRank1(OpTest): @@ -304,5 +306,4 @@ def test_expand_times_is_tensor(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py index 0725a27e5125a..d901813e3482a 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_scatter_op_mlu.py @@ -25,6 +25,8 @@ import paddle.fluid.core as core from paddle.fluid.dygraph.base import switch_to_static_graph +paddle.enable_static() + class TestScatterOp(OpTest): @@ -243,5 +245,4 @@ def executed_api(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_tile_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_tile_op_mlu.py index 1a2f5dbd40eb6..7c1e227ba2c07 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_tile_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_tile_op_mlu.py @@ -24,6 +24,8 @@ import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard +paddle.enable_static() + #Situation 1: repeat_times is a list (without tensor) class TestTileOpRank1(OpTest): @@ -277,5 +279,4 @@ def test_api(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py index 682a9e3909cc7..3f1d553f7386e 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_where_op_mlu.py @@ -29,6 +29,8 @@ from paddle.fluid.backward import append_backward from paddle.fluid.framework import _test_eager_guard +paddle.enable_static() + class TestWhereOp(OpTest): @@ -396,5 +398,4 @@ def test_value_error(self): if __name__ == "__main__": - paddle.enable_static() unittest.main() From 3333a43986d72c39e6685abef28b2be43fc0fc4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 12 Jul 2022 10:05:02 +0200 Subject: [PATCH 147/250] matmul+activation fuse pass (#43519) * add method for post ops * format code * gpd * format style * add matmul+act test * implement matmul+activation * whitespaces * code style * python code format * Increase UT timeout * code format * update style * generalize activation fuse passes * change order * Unify activation GPD * Revert changes with op_act * remove softmax mkldnn attrs * set common name for act attributes * whitespace * append postops by helper function * ut style * revert changes related to quantization * Reduce redundancy * reduce number of parameters * trigger CI * validate attribute * trim unit test --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../conv_activation_mkldnn_fuse_pass.cc | 1 - .../matmul_activation_mkldnn_fuse_pass.cc | 281 ++++++++++++++++++ .../matmul_activation_mkldnn_fuse_pass.h | 41 +++ ...plus_activation_mkldnn_fuse_pass_tester.cc | 4 +- 
.../inference/api/paddle_pass_builder.cc | 1 + .../operators/mkldnn/matmul_mkldnn_op.cc | 3 + ...test_mkldnn_matmul_activation_fuse_pass.py | 127 ++++++++ 8 files changed, 456 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 2e4b73c6ac19a..d31555bf7247c 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -212,6 +212,7 @@ if(WITH_MKLDNN) pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 8c140e8132489..bd07967757b8a 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -26,7 +26,6 @@ using string::PrettyLogDetail; void ConvActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { auto act_types = paddle::platform::GetSupportedActivations(); - std::vector conv_types = {"conv2d"}; for (const auto& conv_type : conv_types) diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000..80f49c97e8465 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.cc @@ -0,0 +1,281 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h" + +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void MatmulActivationMkldnnFusePass::ApplyImpl(Graph* graph) const { + auto act_types = paddle::platform::GetSupportedActivations(); + std::vector matmul_types = {"matmul"}; + + for (const auto& matmul_type : matmul_types) + for (auto& act_type : act_types) { + FuseMatmulAct(graph, matmul_type, act_type); + } +} + +void MatmulActivationMkldnnFusePass::FuseMatmulAct( + Graph* graph, const std::string& matmul_type, std::string& act_type) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(matmul_type + "_" + act_type + "_mkldnn_fuse_pass", graph); + + GraphPatternDetector gpd; + patterns::OperatorActivation matmul_act_pattern( + gpd.mutable_pattern(), "matmul_activation_mkldnn_fuse"); + matmul_act_pattern(matmul_type, act_type); + + int found_matmul_activation_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle " + matmul_type + "+" + act_type + " fuse"; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "matmul_activation_mkldnn_fuse_pass op compat failed."; + return; + } + + GET_IR_NODE_FROM_SUBGRAPH(matmul, preceding_op, matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, preceding_op_out, matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation, activation, matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + activation_out, activation_out, matmul_act_pattern); + + OpDesc* matmul_op = matmul->Op(); + OpDesc* act_op = activation->Op(); + + auto attr_map = paddle::platform::GetAttributeMap(act_type); + for (const auto& attrs : attr_map) { + if (act_op->HasAttr(attrs.first)) { + matmul_op->SetAttr(attrs.second, act_op->GetAttr(attrs.first)); + } + } + + if (act_type == "gelu" && activation->Op()->HasAttr("approximate")) { + act_type = BOOST_GET_CONST(bool, activation->Op()->GetAttr("approximate")) + ? 
"gelu_tanh" + : "gelu_erf"; + } + matmul_op->SetAttr("fuse_activation", act_type); + matmul_op->SetOutput("Out", {activation_out->Name()}); + + IR_NODE_LINK_TO(matmul, activation_out); + GraphSafeRemoveNodes(graph, {activation, matmul_out}); + found_matmul_activation_count++; + }; + + gpd(graph, handler); + AddStatis(found_matmul_activation_count); + if (!Has("disable_logs") || !Get("disable_logs")) { + PrettyLogDetail("--- fused %d matmul with %s activation", + found_matmul_activation_count, + act_type); + } +} + +MatmulActivationMkldnnFusePass::MatmulActivationMkldnnFusePass() { + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() + .End() + .AddAttr("transpose_X") + .IsType() + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("abs")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("clip")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("min") + .End() + .AddAttr("max") + .End(); + + AddOpCompat(OpCompat("gelu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("approximate") + .IsType() + .End(); + + AddOpCompat(OpCompat("hard_sigmoid")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("slope") + .IsOptional() + .IsType() + .End() + .AddAttr("offset") + .IsOptional() + .IsType() + .End(); + + AddOpCompat(OpCompat("hard_swish")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("threshold") + .IsOptional() + .IsType() + .End() + .AddAttr("scale") + .IsOptional() + .IsType() + .End() + .AddAttr("offset") + .IsOptional() + .IsType() + .End(); + + AddOpCompat(OpCompat("leaky_relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() + .End(); + + AddOpCompat(OpCompat("mish")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("relu6")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("threshold") + .IsType() + .End(); + + AddOpCompat(OpCompat("sigmoid")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("sqrt")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("swish")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("beta") + .IsType() + .End(); + + AddOpCompat(OpCompat("tanh")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(matmul_activation_mkldnn_fuse_pass, + paddle::framework::ir::MatmulActivationMkldnnFusePass); + +REGISTER_PASS_CAPABILITY(matmul_activation_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("matmul", 1) + .EQ("abs", 0) + .LE("clip", 1) + .EQ("gelu", 0) + .EQ("hard_sigmoid", 0) + .LE("hard_swish", 0) + .LE("leaky_relu", 1) + .LE("mish", 1) + .EQ("relu", 0) + .EQ("relu6", 0) + .EQ("sigmoid", 0) + .EQ("sqrt", 0) + .EQ("swish", 0) + .EQ("tanh", 
0)); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000..ebef63e292438 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/matmul_activation_mkldnn_fuse_pass.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +class MatmulActivationMkldnnFusePass : public FusePassBase { + public: + MatmulActivationMkldnnFusePass(); + virtual ~MatmulActivationMkldnnFusePass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + + void FuseMatmulAct(Graph *graph, + const std::string &matmul_type, + std::string &act_type) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc index 2fbb46e32d1e9..afe3d75fd2126 100644 --- a/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/softplus_activation_mkldnn_fuse_pass_tester.cc @@ -50,9 +50,9 @@ void MainTest(const std::string& activation_type) { const auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(BOOST_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("fuse_activation_type")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); auto activation_type = - BOOST_GET_CONST(std::string, op->GetAttr("fuse_activation_type")); + BOOST_GET_CONST(std::string, op->GetAttr("fuse_activation")); EXPECT_EQ(activation_type.compare(activation_type), 0); } } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 0d918446ea92a..3642a28790aec 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -302,6 +302,7 @@ void CpuPassStrategy::EnableMKLDNN() { "softplus_activation_mkldnn_fuse_pass", // "shuffle_channel_mkldnn_detect_pass", // "elt_act_mkldnn_fuse_pass", // + "matmul_activation_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 9ab09c3f3cecc..912b1be813a58 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
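The new pass folds a supported activation into the preceding matmul by setting fuse_activation on the matmul op, and the oneDNN kernel then appends it as a post-op through AppendActivation. The numerical contract is simply that the fused op reproduces the unfused composition; a small NumPy sketch with relu standing in for any of the supported activations (illustrative only, with transpose and alpha handling assumed to follow the matmul attrs above):

    import numpy as np

    def relu(x):
        return np.maximum(x, 0.0)

    def unfused_matmul_act(x, y, alpha=1.0, transpose_x=False, transpose_y=False):
        if transpose_x:
            x = np.swapaxes(x, -1, -2)
        if transpose_y:
            y = np.swapaxes(y, -1, -2)
        return relu(alpha * np.matmul(x, y))

    # After fusion the graph is expected to contain a single matmul op whose
    # output matches this composition, which is what the test that follows
    # checks: the fused program keeps only ['matmul'] and agrees to 1e-5.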
*/ #include #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" using dnnl::memory; using dnnl::primitive; @@ -453,6 +454,8 @@ class MatMulMKLDNNHandler matmul_attrs.set_output_scales(0, {scale_out}); } + paddle::platform::AppendActivation(ctx, post_operations); + matmul_attrs.set_post_ops(post_operations); return matmul_attrs; } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py new file mode 100644 index 0000000000000..20028fb335b8f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py @@ -0,0 +1,127 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest +import hypothesis.strategies as st + + +class TestMatmulActivationMkldnnFusePass(PassAutoScanTest): + + def sample_program_config(self, draw): + transpose_X = draw(st.booleans()) + transpose_Y = draw(st.booleans()) + alpha = draw(st.sampled_from([1, 2])) + batch_size = draw(st.sampled_from([4])) + channel = draw(st.sampled_from([8])) + input_dim = draw(st.sampled_from([32])) + activation_type = draw( + st.sampled_from([ + 'relu', 'gelu', 'tanh', 'sigmoid', 'swish', 'mish', 'sqrt', + 'hard_swish', 'sigmoid', 'abs', 'relu6', 'clip', 'tanh', + 'hard_sigmoid', 'leaky_relu' + ])) + + def generate_input(type): + if transpose_X and transpose_Y: + shape_x = [batch_size, channel, input_dim, 32] + shape_y = [batch_size, channel, 64, input_dim] + elif transpose_X: + shape_x = [batch_size, channel, input_dim, 32] + shape_y = [batch_size, channel, input_dim, 64] + elif transpose_Y: + shape_x = [batch_size, channel, 32, input_dim] + shape_y = [batch_size, channel, 8, input_dim] + else: + shape_x = [batch_size, channel, 32, input_dim] + shape_y = [batch_size, channel, input_dim, 16] + + if type == 'x': + return np.random.random(shape_x).astype(np.float32) + else: + return np.random.random(shape_y).astype(np.float32) + + matmul_op = OpConfig(type='matmul', + inputs={ + 'X': ['matmul_X'], + 'Y': ['matmul_Y'] + }, + outputs={'Out': ['matmul_output']}, + attrs={ + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + 'alpha': alpha + }) + + if activation_type == "relu6": + activation_op = OpConfig(activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + threshold=draw( + st.floats(min_value=1.0, + max_value=10.0))) + elif activation_type == "leaky_relu": + activation_op = OpConfig(activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + alpha=draw( + st.floats(min_value=0.1, + max_value=1.0))) + elif activation_type == "swish": + activation_op = OpConfig(activation_type, + inputs={"X": 
["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + beta=draw( + st.floats(min_value=0.1, + max_value=1.0))) + elif activation_type == "clip": + activation_op = OpConfig( + activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + min=draw(st.floats(min_value=0.1, max_value=0.49)), + max=draw(st.floats(min_value=0.5, max_value=1.0))) + else: + activation_op = OpConfig(activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}) + + model_net = [matmul_op, activation_op] + + program_config = ProgramConfig( + ops=model_net, + weights={}, + inputs={ + 'matmul_X': TensorConfig(data_gen=partial(generate_input, 'x')), + 'matmul_Y': TensorConfig(data_gen=partial(generate_input, 'y')) + }, + outputs=['activation_output']) + + return program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, ['matmul'], (1e-5, 1e-5) + + def test(self): + self.run_and_statis(quant=False, + max_examples=30, + passes=['matmul_activation_mkldnn_fuse_pass']) + + +if __name__ == '__main__': + unittest.main() From e379455a1ea6f5264dcb8326dadebc381540439b Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Tue, 12 Jul 2022 16:32:13 +0800 Subject: [PATCH 148/250] =?UTF-8?q?=E3=80=90Auto=20Parallel=E3=80=91update?= =?UTF-8?q?=20base=20cost=20(#44095)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update base cost * update unittest of cost model * add unittest --- .../auto_parallel/cost/base_cost.py | 365 +++++++++++++++--- .../unittests/auto_parallel/CMakeLists.txt | 1 + .../unittests/auto_parallel/test_base_cost.py | 234 +++++++++++ .../unittests/auto_parallel/test_cluster.py | 4 + .../unittests/auto_parallel/test_comm_cost.py | 4 + .../auto_parallel/test_new_cost_model.py | 28 +- 6 files changed, 586 insertions(+), 50 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/cost/base_cost.py index 4455d6f66483b..deac76e45a8b0 100644 --- a/python/paddle/distributed/auto_parallel/cost/base_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/base_cost.py @@ -17,8 +17,12 @@ import paddle -from ..cluster import LinkType +from ..utils import _get_comm_group, _get_corresponding_rank from ..process_group import get_process_group +from ..cluster import LinkType +from ..dist_tensor import DistributedTensor +from ..utils import _get_idx_in_axis +from ..dist_tensor import DistributedTensor COMM_OP_TYPE = [ "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum", @@ -28,33 +32,22 @@ _g_op_cost_factory = {} -def build_comm_desc(op_type, group_ranks, dtype, shape, attrs=None): - desc = {} - desc["op"] = op_type - desc["group_ranks"] = group_ranks - desc["inputs"] = {"X": [(dtype, shape)]} - if attrs is not None: - desc["attrs"] = attrs - return desc - +def build_comp_desc_from_op(op): + """Build the description of computation op.""" + # NOTE: The desc is for serial op. 
+ from ..reshard import get_var_with_recursion -def _parse_op_to_desc(op, dist_context=None): desc = {} - desc["op"] = op.type + # The desc of concat op is {"op": "concat", "inputs": {"X": [(paddle.float32, [20, 20]), (paddle.float32, [20, 20])]}, "outputs": {"Out": [(paddle.float32, [20, 40])], "attrs": {"axis": -1}}} vars = op.block.vars + desc["op"] = op.type input_desc = OrderedDict() for input_name in op.input_names: var_name_list = op.input(input_name) var_desc = [] for var_name in var_name_list: - var = vars[var_name] - shape = None - if dist_context is not None: - dist_tensor = dist_context.get_dist_tensor_for_program(var) - shape = dist_tensor.local_sizes() - else: - shape = var.shape - assert shape is not None + var = get_var_with_recursion(var_name, op.block, op.block.program) + shape = var.shape var_desc.append((var.dtype, shape)) input_desc[input_name] = var_desc desc["inputs"] = input_desc @@ -64,14 +57,8 @@ def _parse_op_to_desc(op, dist_context=None): var_name_list = op.output(out_name) var_desc = [] for var_name in var_name_list: - var = vars[var_name] - shape = None - if dist_context is not None: - dist_tensor = dist_context.get_dist_tensor_for_program(var) - shape = dist_tensor.local_sizes() - else: - shape = var.shape - assert shape is not None + var = get_var_with_recursion(var_name, op.block, op.block.program) + shape = var.shape var_desc.append((var.dtype, shape)) output_desc[out_name] = var_desc desc["outputs"] = output_desc @@ -82,19 +69,101 @@ def _parse_op_to_desc(op, dist_context=None): return desc -def parse_to_desc(op=None, dist_op=None, dist_context=None): - desc = None - if op is None and dist_op is not None and dist_context is not None: - desc = _parse_op_to_desc(op=dist_op.serial_op, - dist_context=dist_context) - elif op is not None and dist_op is None and dist_context is None: - desc = _parse_op_to_desc(op) - - return desc - - -def parse_desc_to_str(desc): - +def build_comp_desc_from_dist_op(dist_op, dist_context): + """Build descriptions of computation op distributed on the processes.""" + from ..reshard import get_var_with_recursion + + op_descs = {} + op = dist_op.serial_op + dist_attr = dist_op.dist_attr + process_mesh = dist_attr.process_mesh + assert process_mesh, "Process mesh must not be None." + processes = process_mesh.processes + for process in processes: + desc = {} + desc["op"] = op.type + attr_desc = op.all_attrs() + # NOTE: The attrs of desc is replica of serial op, there may be a bug if shape need to be partitioned involved in attrs. + desc["attrs"] = attr_desc + input_desc = OrderedDict() + output_desc = OrderedDict() + + # Get partitioned shape of input + for input_name in op.input_names: + var_name_list = op.input(input_name) + var_desc = [] + for var_name in var_name_list: + var = get_var_with_recursion(var_name, op.block, + op.block.program) + # Use op input_dims_mapping + dims_mapping = dist_attr.get_input_dims_mapping(var_name) + global_sizes = var.shape + # NOTE: When support uneven partition, the shard_sizes will be got from dist_attr. 
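The per-process input shapes recorded in these descs come from the tensor's dims_mapping and the process-mesh topology. With shard_sizes left as None (the even-partition case noted above), the rule DistributedTensor.get_local_sizes is expected to apply is: a dim mapped to mesh axis a is split into topology[a] equal shards, while a dim mapped to -1 stays whole. A minimal sketch of that rule, under the even-split assumption:

    def local_sizes(global_sizes, dims_mapping, topology):
        # dims_mapping[i] == -1: dim i is replicated and keeps its extent;
        # dims_mapping[i] == a : dim i is split across topology[a] processes.
        return [
            size if axis == -1 else size // topology[axis]
            for size, axis in zip(global_sizes, dims_mapping)
        ]

    # Example: a [1024, 4096] weight with dims_mapping [-1, 1] on a 2 x 2 mesh
    # (topology [2, 2]) keeps dim 0 and halves dim 1, so every process records
    # an input shape of [1024, 2048] in its desc.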
+ shard_sizes = None + topology = process_mesh.topology + shape = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes, process, + shard_sizes) + var_desc.append((var.dtype, shape)) + + # For special op such as embedding and its grad op + if op.type == "c_embedding" or op.type == "lookup_table_v2" or op.type == "c_embedding_grad" or op.type == "lookup_table_v2_grad": + if input_name == "W": + embedding_row_dim_mapping = dist_attr.get_input_dims_mapping( + op.input(input_name)[0])[0] + relative_idx = _get_idx_in_axis( + processes, dist_attr.process_mesh.topology, + embedding_row_dim_mapping, process) + per_part_size = shape[0] + relative_idx = relative_idx * per_part_size + desc["attrs"]["start_index"] = relative_idx + + input_desc[input_name] = var_desc + desc["inputs"] = input_desc + + for out_name in op.output_names: + var_name_list = op.output(out_name) + var_desc = [] + for var_name in var_name_list: + # Use op output_dims_mapping + var = get_var_with_recursion(var_name, op.block, + op.block.program) + dist_attr = dist_op.dist_attr + dims_mapping = dist_attr.get_output_dims_mapping(var_name) + process_mesh = dist_attr.process_mesh + global_sizes = var.shape + shard_sizes = None + processes = process_mesh.processes + topology = process_mesh.topology + shape = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes, process, + shard_sizes) + var_desc.append((var.dtype, shape)) + + # For special op such as fill_constant_batch_size_like + if op.type == "fill_constant_batch_size_like": + # Modify shape attr according to how output are partitioned + out_name = var_name_list[0] + dims_mapping = dist_attr.get_output_dims_mapping(out_name) + process_mesh_shape = dist_attr.process_mesh.topology + shape_list = op.attr("shape") + # Modify target shape + for idx, axis in enumerate(dims_mapping): + if axis >= 0: + shape_list[idx] = shape_list[ + idx] // process_mesh_shape[axis] + desc["attrs"]["shape"] = shape_list + output_desc[out_name] = var_desc + + desc["outputs"] = output_desc + + op_descs[process] = desc + + return op_descs + + +def build_comp_desc_str_for_predict(desc): + # NOTE: The description format may change in the future. def _parse_dtype(dtype): dtype_str = "" if dtype == paddle.float32: @@ -135,8 +204,208 @@ def _parse_dtype(dtype): shape_str = "[" + ",".join(shape_list) + "]" desc_str_list += [dtype_str, dims_str, shape_str] desc_str = "_".join(desc_str_list) + attrs = desc["attrs"] + parse_result = (desc_str, attrs) + return parse_result + + +def build_comm_desc_from_dist_op(op_type, + dist_op, + ctx, + var_names, + attrs=None, + parallel_axis=None, + group_ranks=None): + """Build descriptions of communication op distributed on the processes.""" + from ..reshard import get_var_with_recursion + + specific_op_type = [] + dist_attr = dist_op.dist_attr + assert dist_attr, "Dist attr must not be None." + process_mesh = dist_attr.process_mesh + assert process_mesh, "Process mesh must not be None." + + processes = process_mesh.processes + op_descs = {} + for process in processes: + rank_id = process + desc = {} + desc["op"] = op_type + op_attrs = None + comm_group_ranks = None + + if op_type not in specific_op_type: + serial_op = dist_op.serial_op + input_list = [] + # The var_names usually contain just one item. 
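The c_embedding / lookup_table_v2 special case in build_comp_desc_from_dist_op above rewrites start_index to the first row owned by each process (its index along the sharded mesh axis times the local row count). A sketch of the row-parallel lookup that attribute supports, assuming the usual vocab-parallel scheme in which ids outside the local range produce zeros that a later allreduce_sum across the mesh axis fills in:

    import numpy as np

    def sharded_embedding_lookup(ids, full_table, num_shards, my_shard):
        # Each process keeps rows [start_index, start_index + per_part).
        per_part = full_table.shape[0] // num_shards
        start_index = my_shard * per_part          # the attr rewritten above
        local_table = full_table[start_index:start_index + per_part]
        local_ids = ids - start_index
        mask = (ids >= start_index) & (ids < start_index + per_part)
        out = np.zeros((len(ids), full_table.shape[1]), full_table.dtype)
        out[mask] = local_table[local_ids[mask]]
        # Rows owned by other shards stay zero here; summing the partial
        # results across shards reconstructs the full lookup.
        return out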
+ for var_name in var_names: + dist_attr = dist_op.dist_attr + has_found = False + # Find var_name in serial op input or output + for name in dist_op.serial_op.input_arg_names: + # If a tensor is the input of multi ops, sum the grad of all ops, so the name will be varname@RENAME@block@0 and so on. + if var_name in name: + var_name = name + has_found = True + break + + if not has_found: + for name in dist_op.serial_op.output_arg_names: + if var_name in name: + var_name = name + has_found = True + break + assert has_found + var = get_var_with_recursion(var_name, serial_op.block, + serial_op.block.program) + + dims_mapping = dist_attr.get_input_dims_mapping( + var_name + ) if var_name in dist_op.serial_op.input_arg_names else dist_attr.get_output_dims_mapping( + var_name) + global_sizes = var.shape + shard_sizes = None + topology = process_mesh.topology + shape = DistributedTensor.get_local_sizes( + global_sizes, dims_mapping, topology, processes, process, + shard_sizes) + input_list.append((var.dtype, shape)) + + # NOTE: The input_name of comm ops used usually is X. + desc["inputs"] = {"X": input_list} + + # Get comm group by parallel_axis or the given group_ranks. + if parallel_axis is not None: + process_mesh_shape = process_mesh.topology + process_mesh_group = process_mesh.processes + comm_group_ranks = _get_comm_group(process_mesh_group, + process_mesh_shape, + parallel_axis, rank_id) + elif group_ranks is not None: + comm_group_ranks = group_ranks + else: + raise ValueError( + "The parallel_axis and group_ranks can not be None in the same." + ) + + if attrs is not None: + assert isinstance(attrs, dict) + op_attrs = attrs + else: + op_attrs = {} + + desc["attrs"] = op_attrs + desc["group_ranks"] = comm_group_ranks + + op_descs[rank_id] = desc + + return op_descs + + +def build_comm_desc(op_type, group_ranks, dtype, shape, attrs=None): + """Build a comm desc directly.""" + desc = {} + desc["op"] = op_type + desc["group_ranks"] = group_ranks + desc["inputs"] = {"X": [(dtype, shape)]} + desc["attrs"] = attrs + return desc + - return desc_str +def build_comm_costs_from_descs(op_cost_class, ctx, processes, descs, cluster): + """Build comm costs by descriptions""" + comm_context = CommContext(cluster) + group_ranks_list = [] + comm_op_cost_list = [] + for process in processes: + desc = descs[process] + group_ranks = desc["group_ranks"] + if group_ranks not in group_ranks_list: + group_ranks_list.append(group_ranks) + comm_op_cost = op_cost_class(op_desc=desc, + comm_context=comm_context) + comm_op_cost_list.append(comm_op_cost) + return comm_op_cost_list + + +def build_comp_costs_from_descs(op_cost_class, ctx, processes, descs, cluster): + """Build comp costs by descriptions.""" + costs = {} + for process in processes: + costs[process] = op_cost_class(op_desc=descs[process], cluster=cluster) + return costs + + +def build_dp_costs(result, dist_op, ctx, var_names, attrs, parallel_axis, + cluster): + """DP cost contains a allreduce_sum op cost and a scale op cost""" + # The costs will be appended in the given result. 
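build_comm_costs_from_descs above creates one comm cost per distinct group_ranks because every rank in a group takes part in the same collective. The group itself comes from parallel_axis in build_comm_desc_from_dist_op: the ranks that share all mesh coordinates except the chosen axis. A small sketch of that derivation, assuming _get_comm_group slices the mesh along parallel_axis:

    import numpy as np

    def comm_group(mesh, parallel_axis, rank):
        mesh = np.array(mesh)
        # Coordinates of this rank inside the process mesh.
        coord = [int(c[0]) for c in np.where(mesh == rank)]
        # Vary only the chosen axis, keep every other coordinate fixed.
        index = tuple(slice(None) if d == parallel_axis else c
                      for d, c in enumerate(coord))
        return sorted(int(r) for r in mesh[index].ravel())

    # Mesh [[0, 1], [2, 3]] (topology [2, 2]):
    #   comm_group(mesh, 0, 0) -> [0, 2]   comm_group(mesh, 0, 1) -> [1, 3]
    #   comm_group(mesh, 1, 0) -> [0, 1]   comm_group(mesh, 1, 2) -> [2, 3]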
+ from ..reshard import get_var_with_recursion + + dist_attr = dist_op.dist_attr + process_mesh = dist_attr.process_mesh + processes = process_mesh.processes + assert len(var_names) == 1 + vars = dist_op.serial_op.block.vars + var_name = var_names[0] + has_found = False + for name in dist_op.serial_op.input_arg_names: + if var_name in name: + var_name = name + has_found = True + break + + if not has_found: + for name in dist_op.serial_op.output_arg_names: + if var_name in name: + var_name = name + has_found = True + break + if not has_found: + return + + c_allreduce_sum_descs = build_comm_desc_from_dist_op( + "c_allreduce_sum", + dist_op, + ctx, + var_names, + attrs=attrs, + parallel_axis=parallel_axis) + comm_cost_list = build_comm_costs_from_descs( + _g_op_cost_factory["c_allreduce_sum"], ctx, processes, + c_allreduce_sum_descs, cluster) + result.append(comm_cost_list) + + # The scale op just on the group_ranks + for comm_cost in comm_cost_list: + group_ranks = comm_cost.group_ranks + dp_degree = len(group_ranks) + scale_costs = {} + op_type = "scale" + for rank in group_ranks: + desc = {} + desc["op"] = op_type + desc["inputs"] = {} + dims_mapping = dist_attr.get_input_dims_mapping( + var_name) if dist_attr.get_input_dims_mapping( + var_name + ) is not None else dist_attr.get_output_dims_mapping(var_name) + var = get_var_with_recursion(var_name, dist_op.serial_op.block, + dist_op.serial_op.block.program) + global_sizes = var.shape + shard_sizes = None + topology = process_mesh.topology + shape = DistributedTensor.get_local_sizes(global_sizes, + dims_mapping, topology, + processes, rank, + shard_sizes) + desc["inputs"]["X"] = [(var.dtype, shape)] + attrs = {"scale": 1.0 / dp_degree} + desc["attrs"] = attrs + scale_op_cost = _g_op_cost_factory["scale"](op_desc=desc, + cluster=cluster) + scale_costs[rank] = scale_op_cost + result.append(scale_costs) class CommContext: @@ -174,6 +443,8 @@ def _post_init(self): # set default self.base_ring = 8.4 self.base_tree = 0. + # self.base_inter_ring = 9.6 + # self.base_inter_tree = 28 # NVL in default self.intra_ring = 3.4 self.intra_tree = 28 @@ -441,6 +712,8 @@ def comm_context(self): @property def comm_count(self): + from ..reshard import get_var_with_recursion + if self._comm_count is None: dtype = None shape = None @@ -448,7 +721,8 @@ def comm_count(self): vars = self.op.block.vars # NOTE: The tensor communicated input_name is "X" in default. 
Otherwise, this function should be overrided var_name = self.op.input("X")[0] - var = vars[var_name] + var = get_var_with_recursion(var_name, self.op.block, + self.program) dtype = var.dtype shape = var.shape elif self.op_desc is not None: @@ -464,9 +738,10 @@ def comm_count(self): factor = 1 elif dtype == paddle.float16: factor = 2 + elif dtype == paddle.bool: + factor = 8 else: - raise TypeError( - "This dtype {} is not supported now".format(dtype)) + raise ValueError("Unsupported comm dtype {}".format(dtype)) comm_count = reduce(lambda x, y: x * y, shape) * factor self._comm_count = comm_count diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 5738412dd52ae..6c51ce1fffae3 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -51,6 +51,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS}) py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS}) + py_test_modules(test_base_cost MODULES test_base_cost ENVS ${dist_ENVS}) py_test_modules(test_dist_context MODULES test_dist_context ENVS ${dist_ENVS}) py_test_modules(test_prim_dist_op MODULES test_prim_dist_op ENVS ${dist_ENVS}) py_test_modules(test_to_static MODULES test_to_static ENVS ${dist_ENVS}) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py new file mode 100644 index 0000000000000..0fbe4f5bd3d09 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py @@ -0,0 +1,234 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
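build_dp_costs above prices data-parallel gradient synchronization as a c_allreduce_sum over the group followed by a scale with 1.0 / dp_degree, which together average the gradient across replicas. A minimal NumPy sketch of the computation those two ops perform:

    import numpy as np

    def dp_grad_sync(local_grads):
        # c_allreduce_sum: every replica ends up with the sum ...
        summed = np.sum(local_grads, axis=0)
        # ... then scale by 1 / dp_degree turns the sum into the average.
        return summed / len(local_grads)

    # Two replicas of the same parameter, different mini-batches:
    dp_grad_sync([np.array([0.2, -0.4]), np.array([0.6, 0.0])])
    # -> array([ 0.4, -0.2])

The scale cost is attached only to the ranks inside the group, which matches the per-group loop above.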
+ +from __future__ import print_function + +import unittest +import os +import json +import tempfile + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer +from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.reshard import Resharder +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cost import CommContext +from paddle.distributed.auto_parallel.cost.base_cost import build_comp_desc_from_dist_op +from paddle.distributed.auto_parallel.cost.base_cost import build_comm_desc_from_dist_op +from paddle.distributed.auto_parallel.cost.base_cost import build_comm_costs_from_descs +from paddle.distributed.auto_parallel.cost.base_cost import build_comp_costs_from_descs +from paddle.distributed.auto_parallel.cost.base_cost import build_dp_costs +from paddle.distributed.auto_parallel.cost import AllreduceSumOpCost +from paddle.distributed.auto_parallel.cost import _g_op_cost_factory +from test_cluster import cluster_json + +paddle.enable_static() +_global_parallel_strategy = "dp_mp_pp" +_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]]) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]]) + + +class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear(d_model, + dim_feedforward, + weight_attr, + bias_attr=bias_attr) + self.linear1 = nn.Linear(dim_feedforward, + d_model, + weight_attr, + bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + auto.shard_tensor(self.linear0.weight, + dist_attr={ + "process_mesh": PP_MESH_0, + "dims_mapping": [-1, 1] + }) + auto.shard_tensor(self.linear1.weight, + dist_attr={ + "process_mesh": PP_MESH_1, + "dims_mapping": [1, -1] + }) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data(name="input", + shape=[batch_size, hidden_size], + dtype='float32') + label = static.data(name="label", + shape=[batch_size, 1], + dtype='float32') + + fill_constant_out = paddle.fluid.layers.fill_constant_batch_size_like( + input=input, shape=[batch_size], value=1, dtype="int32") + embedding = paddle.nn.Embedding(10, hidden_size, sparse=True) + embedding_out = embedding(fill_constant_out) + + auto.shard_tensor(input, + dist_attr={ + "process_mesh": PP_MESH_0, + "dims_mapping": [0, -1] + }) + auto.shard_tensor(label, + dist_attr={ + "process_mesh": PP_MESH_1, + "dims_mapping": [0, -1] + }) + + mlp = 
MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(embedding_out) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.process_mesh = _global_process_mesh + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + fleet._user_defined_strategy = fleet.DistributedStrategy() + fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + parallelizer = AutoParallelizer(fleet) + parallelizer._dist_context = dist_context + + # serial forward & backward completion + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) + dist_context.block_state.parse_forward_blocks(complete_train_program) + params_grads = parallelizer._generate_backward(complete_train_program, + startup_program, + loss, + parameter_list=None, + no_grad_set=None, + callbacks=None) + return train_program, startup_program, params_grads + + +class TestBaseCost(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + + def test_base_cost(self): + # Build cluster + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 2 + train_program, startup_program, params_grads = get_prog( + train_program, startup_program, dist_context, rank_id) + + for op in train_program.global_block().ops: + dist_op = dist_context.get_dist_op_for_program(op) + if dist_op: + processes = dist_op.dist_attr.process_mesh.processes + comp_descs = build_comp_desc_from_dist_op(dist_op, dist_context) + self.assertTrue(isinstance(comp_descs, dict) and comp_descs) + var_names = None + if op.input_arg_names: + var_names = op.input_arg_names[0] + comm_descs = build_comm_desc_from_dist_op("c_allreduce_sum", + dist_op, + dist_context, + var_names, + attrs=None, + parallel_axis=0, + group_ranks=None) + self.assertTrue(isinstance(comm_descs, dict) and comm_descs) + comm_descs = build_comm_desc_from_dist_op( + "c_allreduce_sum", + dist_op, + dist_context, + var_names, + attrs=None, + parallel_axis=None, + group_ranks=processes) + self.assertTrue(isinstance(comm_descs, dict) and comm_descs) + + comm_costs = build_comm_costs_from_descs( + AllreduceSumOpCost, dist_context, processes, comm_descs, + cluster) + self.assertTrue(comm_costs) + + comp_costs = build_comp_costs_from_descs( + _g_op_cost_factory[op.type], dist_context, processes, + comp_descs, cluster) + self.assertTrue(comp_costs) + + result = [] + build_dp_costs(result, dist_op, dist_context, var_names[0], + None, 0, cluster) + self.assertTrue(result) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py index 
dd9b0110dbebd..641ca38b64944 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py @@ -2018,6 +2018,10 @@ def test_multi_machine(self): self.assertTrue(devices == [5, 6, 7, 10]) self.assertTrue(involved_machine_count == 2) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py index 215385787880c..5744cf6d39206 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py @@ -154,6 +154,10 @@ def test_cross_machine_comm_cost(self): comm_context=comm_context) self.assertTrue(recv_op_cost.time > 0) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py index fe46131225759..6b0db61b984c5 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py @@ -19,8 +19,8 @@ import paddle import paddle.distributed.auto_parallel.cost as cost_model -from paddle.distributed.auto_parallel.cost.base_cost import parse_to_desc -from paddle.distributed.auto_parallel.cost.base_cost import parse_desc_to_str +from paddle.distributed.auto_parallel.cost.base_cost import build_comp_desc_from_op +from paddle.distributed.auto_parallel.cost.base_cost import build_comp_desc_str_for_predict from paddle.distributed.auto_parallel.cost.base_cost import calc_time_by_modeling from paddle.distributed.auto_parallel.cluster import Cluster from paddle.distributed.auto_parallel.cost import CommContext @@ -60,8 +60,8 @@ def test_comp_cost(self): break matmul_v2_cost = cost_model._g_op_cost_factory["matmul_v2"]( op=matmul_v2_op) - desc = parse_to_desc(op=matmul_v2_op) - desc_str = parse_desc_to_str(desc) + desc = build_comp_desc_from_op(op=matmul_v2_op) + desc_str = build_comp_desc_str_for_predict(desc) self.assertIsNotNone(desc_str) self.assertTrue(check_cost(matmul_v2_cost.cost)) time = calc_time_by_modeling(op=matmul_v2_op) @@ -92,11 +92,29 @@ def test_comm_cost(self): op_desc=desc, comm_context=CommContext(cluster)) self.assertTrue(check_cost(allreduce_cost.cost)) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + def test_cost_estimator(self): + # Build cluster + cluster_json_path = os.path.join(self.temp_dir.name, + "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + train_program = paddle.static.Program() - cost_estimator = cost_model.CostEstimator(train_program) + cost_estimator = cost_model.CostEstimator(train_program, + cluster=cluster) self.assertIsNotNone(cost_estimator) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + if __name__ == "__main__": unittest.main() From 8759c78d8e9bd4e7361f0fbff8febc51a84709c5 Mon Sep 17 00:00:00 2001 From: WangZhen 
<23097963+0x45f@users.noreply.github.com> Date: Tue, 12 Jul 2022 19:19:33 +0800 Subject: [PATCH 149/250] Fix select input error when handle 0 dim Tensor (#44251) --- paddle/fluid/operators/assign_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index e90de55bbbd90..e6374eb3a66f7 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -71,7 +71,7 @@ class AssignFunctor { private: void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { - if (lod_tensor.numel() == 0) return; + if (!lod_tensor.IsInitialized()) return; auto &out_tensor = *out; paddle::framework::TensorCopy(lod_tensor, lod_tensor.place(), &out_tensor); out_tensor.set_lod(lod_tensor.lod()); From c5c6026e128017af59a8a908c1ee10fc6f37240d Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 12 Jul 2022 19:32:33 +0800 Subject: [PATCH 150/250] [ Dy2Static ]Change NameVisitor in while to FunctionScopeAnalysis (#44155) * change NameVisitor to FunctionScopeAnalysis * polish the logic of undefined var in while_loop. create vars after body execution * replace old NameVisitor in while and fix all CI * Togather with CreateVariableTransformer * add create_variable_transformer * fix bugs * merge * fix some error, TODO: ForNodePreTransform ahead * merge for unite PR * fix conflict with base_transformer PR * fix ci errors, fix [for i in range()] error * fix according to code review --- .../dygraph_to_static/ast_transformer.py | 3 +- .../dygraph_to_static/base_transformer.py | 169 ++++++------------ .../dygraph_to_static/call_transformer.py | 15 +- .../dygraph_to_static/convert_call_func.py | 34 ++-- .../dygraph_to_static/convert_operators.py | 104 ++++++++++- .../create_variable_transformer.py | 48 +++++ .../dygraph_to_static/ifelse_transformer.py | 23 ++- .../dygraph_to_static/loop_transformer.py | 58 ++---- .../fluid/dygraph/dygraph_to_static/utils.py | 19 ++ python/paddle/fluid/layers/control_flow.py | 43 ++++- .../seq2seq_dygraph_model.py | 1 + .../unittests/dygraph_to_static/test_loop.py | 7 - .../test_program_translator.py | 12 +- .../dygraph_to_static/test_tensor_shape.py | 6 +- .../transformer_dygraph_model.py | 4 +- python/paddle/jit/dy2static/__init__.py | 3 +- .../paddle/jit/dy2static/convert_operators.py | 1 + 17 files changed, 337 insertions(+), 213 deletions(-) create mode 100644 python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index f1ab097758b71..a9e8f447e998c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -35,6 +35,7 @@ from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import LoopTransformer from paddle.fluid.dygraph.dygraph_to_static.print_transformer import PrintTransformer from paddle.fluid.dygraph.dygraph_to_static.return_transformer import ReturnTransformer +from paddle.fluid.dygraph.dygraph_to_static.create_variable_transformer import CreateVariableTransformer from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.tensor_shape_transformer import TensorShapeTransformer @@ -96,7 +97,7 @@ def transfer_from_node_type(self, node_wrapper): BreakContinueTransformer, # break/continue in loops 
ReturnTransformer, # return in functions LogicalTransformer, # logical and/or/not - #CreateVariableTransformer, # create undefined var for if / while / for + CreateVariableTransformer, # create undefined var for if / while / for LoopTransformer, # for/while -> while_op IfElseTransformer, # if/else -> cond_op AssertTransformer, # assert statement diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py index a3c2c0c69efaf..9df7e8d9b4f41 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/base_transformer.py @@ -24,6 +24,8 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_VAR_LEN_PREFIX from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_VAR_NAME_PREFIX from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_ZIP_TO_LIST_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_TARGET_PREFIX +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_ITERATOR_PREFIX class BaseTransformer(gast.NodeTransformer): @@ -119,32 +121,20 @@ def replace(s): class ForLoopTuplePreTransformer(BaseTransformer): - """ - ForNodeVisitor parses 3 type statements (Here var is VarBase(Tensor) or python variable): - 1). for x in range(var[*]|var.numpy()[*]) - 2). for x in var|var.numpy() - 3). for i, x in enumerate(var|var.numpy()) - - We chose these 3 types because they are easier (x can be variable name iterating in var). - However, users can write tuples in Python for loop, such as - 1). for var1, var2 in var|var.numpy() - 2). for t in enumerate(var|var.numpy()) - 2). for i, (var1, var2, va3) in enumerate(var|var.numpy()) - - To handle these case, this method will do the rewrite tuple pre-process: - 1). Non-enumerate case: for var1, var2 in var|var.numpy() will be re-written as: - for FOR_ITER_TUPLE_PREFIX_x in var | var.numpy(): - var1 = FOR_ITER_TUPLE_PREFIX_x[0] - var2 = FOR_ITER_TUPLE_PREFIX_x[1] - 2). Enumerate out tuple case: for t in enumerate(var|var.numpy) will be rewritten as: - for FOR_ITER_TUPLE_INDEX_PREFIX_x, FOR_ITER_TUPLE_PREFIX_x in enumerate(var|var.numpy): - t = (FOR_ITER_TUPLE_INDEX_PREFIX_x, FOR_ITER_TUPLE_PREFIX_x) - 3). Enumerate inner tuple case: for i, (var1, (var2, va3)) in enumerate(var|var.numpy()) will - be re-written as: - for i, FOR_ITER_TUPLE_PREFIX_x in var | var.numpy(): - var1 = FOR_ITER_TUPLE_PREFIX_x[0] - var2 = FOR_ITER_TUPLE_PREFIX_x[1][0] - var3 = FOR_ITER_TUPLE_PREFIX_x[1][1] + """ pre-process of for loop. + >>> for A in B: + >>> C + + will be changed into : + + >>> UUID_iterator = _jst.Indexable(B) # make iterator-only to indexable list. 
+ >>> for UUID_target in UUID_iterator: + >>> A = _jst.Unpack(UUID_target, structure) + >>> C + + make the later loop_transform have unified type: + >>> for target in iter: + >>> body """ def __init__(self, wrapper_root): @@ -155,104 +145,45 @@ def transform(self): self.visit(self.root) def visit_For(self, node): - if self.is_for_enumerate_iter(node): - if isinstance(node.target, (gast.Name, gast.Attribute)): - # Out tuple case - out_tuple_name = ast_to_source_code(node.target).strip() - tuple_iter_name = unique_name.generate( - FOR_ITER_TUPLE_INDEX_PREFIX) - tuple_var_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) - node.target = gast.Tuple(elts=[ - gast.Name(id=tuple_iter_name, - ctx=gast.Store(), - annotation=None, - type_comment=None), - gast.Name(id=tuple_var_name, - ctx=gast.Store(), + self.generic_visit(node) + tuple_target = unique_name.generate(FOR_ITER_TARGET_PREFIX) + tuple_iterator = unique_name.generate(FOR_ITER_ITERATOR_PREFIX) + origin_tuple_node = node.target + assign_iterator_node = gast.parse( + f"{tuple_iterator} = _jst.Indexable({ast_to_source_code(node.iter).strip()})" + ).body[0] + node.target = gast.Name(id=tuple_target, + ctx=gast.Store(), + annotation=None, + type_comment=None) + node.iter = gast.Name(id=tuple_iterator, + ctx=gast.Load(), annotation=None, type_comment=None) - ], - ctx=gast.Store()) - node.body.insert( - 0, - gast.Assign(targets=[ - gast.Name(id=out_tuple_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - ], - value=gast.Tuple(elts=[ - gast.Name(id=tuple_iter_name, - ctx=gast.Load(), - annotation=None, - type_comment=None), - gast.Name(id=tuple_var_name, - ctx=gast.Load(), - annotation=None, - type_comment=None) - ], - ctx=gast.Load()))) - elif isinstance(node.target, (gast.List, gast.Tuple)) and len( - node.target.elts) >= 2 and isinstance( - node.target.elts[1], (gast.List, gast.Tuple)): - # Inner tuple case - inner_tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) - origin_inner_tuple_node = node.target.elts[1] - node.target.elts[1] = gast.Name(id=inner_tuple_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - node.body[0:0] = self.tuple_to_stmts(origin_inner_tuple_node, - inner_tuple_name) - elif self.is_for_iter(node) and isinstance(node.target, - (gast.List, gast.Tuple)): - # Non-enumrate case: - tuple_name = unique_name.generate(FOR_ITER_TUPLE_PREFIX) - origin_tuple_node = node.target - node.target = gast.Name(id=tuple_name, - ctx=gast.Store(), - annotation=None, - type_comment=None) - node.body[0:0] = self.tuple_to_stmts(origin_tuple_node, tuple_name) - return node - - def tuple_to_stmts(self, node, tuple_name, idx=[]): - if not isinstance(node, (gast.Tuple, gast.List)): - value_node_str = tuple_name - for i in idx: - value_node_str = value_node_str + "[{}]".format(i) - - node_str = ast_to_source_code(node).strip() - assign_node_str = "{} = {}".format(node_str, value_node_str) - assign_node = gast.parse(assign_node_str).body[0] - return [assign_node] - - # isinstance(node, (gast.Tuple, gast.List)) + node.body[0:0] = self.tuple_to_stmts(origin_tuple_node, tuple_target) + # return a list will insert a list of node replace the original for node. + return [assign_iterator_node, node] + + def tuple_node_to_unpack_structure(self, node): + """ Create a sequence to represents the structure of nest. + For example: `a, (b,c), [d,e,f]` is represented by + `[1, [1,1], [1,1,1]]`. the `1` is just a notation. + + Specially, `a` is represented by `1`. 
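For illustration, a minimal pure-Python sketch of the structure-based unpacking described in the two docstrings above; the `unpack` helper below is an assumed stand-in for the real `_jst.Unpack`, not the actual implementation:

    def unpack(target, structure):
        # A bare name has structure `1` and consumes the whole target.
        if structure == 1:
            return target
        # Otherwise recurse element-wise, mirroring the nesting of the loop target.
        return [unpack(target[i], s) if isinstance(s, list) else target[i]
                for i, s in enumerate(structure)]

    # `for i, (x, y) in ...` has the structure [1, [1, 1]]:
    assert unpack((0, ("a", "b")), [1, [1, 1]]) == [0, ["a", "b"]]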
+ """ ret = [] - for i, element in enumerate(node.elts): - ret += self.tuple_to_stmts(node.elts[i], tuple_name, idx + [i]) + if not isinstance(node, (gast.Tuple, gast.List)): + return 1 + for element in node.elts: + ret.append(self.tuple_node_to_unpack_structure(element)) return ret - def is_for_iter(self, for_node): - assert isinstance(for_node, - gast.For), "Input node is not gast.For node." - if isinstance(for_node.iter, (gast.Name, gast.Attribute)): - return True - elif isinstance(for_node.iter, gast.Call) and isinstance( - for_node.iter.func, - gast.Attribute) and for_node.iter.func.attr == 'numpy': - return True - elif isinstance(for_node.iter, gast.Subscript): - return True - else: - return False - - def is_for_enumerate_iter(self, for_node): - assert isinstance(for_node, - gast.For), "Input node is not gast.For node." - return isinstance(for_node.iter, gast.Call) and isinstance( - for_node.iter.func, - gast.Name) and for_node.iter.func.id == "enumerate" + def tuple_to_stmts(self, node, tuple_name): + structure_str = str(self.tuple_node_to_unpack_structure(node)) + node_str = ast_to_source_code(node).strip() + assign_node_str = f"{node_str} = _jst.Unpack({tuple_name}, {structure_str})" + assign_node = gast.parse(assign_node_str).body[0] + return [assign_node] class SplitAssignTransformer(BaseTransformer): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index c9f56287ed3c5..15b909f3d3d84 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -40,7 +40,7 @@ def _no_need_convert_call(self, node): Determines whether a function needs to be transformed by `convert_call`. It doesn't need to be transformed when a function satisfies the following conditions: 1. It's a api of paddle - 2. It's a python builtin function not include `len` and `zip` + 2. 
It's a python builtin function not include `len`, `zip`, `range` and `enumerate` """ assert isinstance(node, gast.Call) if is_paddle_api(node): @@ -48,11 +48,16 @@ def _no_need_convert_call(self, node): func_str = ast_to_source_code(node.func).strip() try: - from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import is_builtin_len, is_builtin, is_builtin_zip + from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import is_builtin + need_convert_builtin_func_list = { + 'len', + 'zip', + 'range', + 'enumerate', + } is_builtin = eval("is_builtin({})".format(func_str)) - is_builtin_len = eval("is_builtin_len({})".format(func_str)) - is_builtin_zip = eval("is_builtin_zip({})".format(func_str)) - return is_builtin and not is_builtin_len and not is_builtin_zip + need_convert = func_str in need_convert_builtin_func_list + return is_builtin and not need_convert except Exception: return False diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index e660a64ab363c..5bb75bda8de97 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -28,6 +28,7 @@ from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len, convert_zip +from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_range, convert_enumerate from paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static @@ -64,25 +65,22 @@ def __init__(self, not_convert=False): self.not_convert = not_convert -def is_builtin(func): - if isinstance(func, types.BuiltinFunctionType): +def is_builtin(func, name=None): + """ predict whether a function is a builtin function with name={name}. + if name == None, then any builtin function will return True + """ + + def name_judge(): + return name is None or func.__name__ == name + + if isinstance(func, types.BuiltinFunctionType) and name_judge(): return True - elif func in six.moves.builtins.__dict__.values(): + elif func in six.moves.builtins.__dict__.values() and name_judge(): return True else: return False -def is_builtin_len(func): - if isinstance(func, types.BuiltinFunctionType) and func.__name__ == 'len': - return True - return False - - -def is_builtin_zip(func): - return is_builtin(func) and func.__name__ == 'zip' - - def is_unsupported(func): """ Checks whether the func is supported by dygraph to static graph. @@ -165,12 +163,18 @@ def dyfunc(x): .format(func)) return func - if is_builtin_len(func): + if is_builtin(func, "len"): return convert_len - if is_builtin_zip(func): + if is_builtin(func, "zip"): return convert_zip + if is_builtin(func, "range"): + return convert_range + + if is_builtin(func, "enumerate"): + return convert_enumerate + if is_builtin(func) or is_unsupported(func): return func diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 583db5c0dcdba..e0b46fe2341a3 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -13,11 +13,12 @@ # limitations under the License. 
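The note above lists `len`, `zip`, `range` and `enumerate` as the builtins that still need conversion; the common pattern is to fall back to the plain Python builtin unless a Tensor is involved. A minimal sketch of that dispatch is below, where `is_tensor` and `paddle_range` are placeholder arguments standing in for the real Variable check and the Paddle range op, so this is only an assumed simplification of `convert_range`:

    def convert_range_sketch(*args, is_tensor=lambda x: False, paddle_range=range):
        # If any argument is a Tensor, the range itself must become a graph op;
        # otherwise plain Python `range` keeps its usual eager semantics.
        if any(map(is_tensor, args)):
            return paddle_range(*args)
        return range(*args)

    assert list(convert_range_sketch(3)) == [0, 1, 2]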
import re - +import paddle from paddle.fluid.data_feeder import convert_dtype from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable from paddle.fluid.framework import core, Variable from paddle.fluid.layers import Assert, Print +from paddle.fluid.layers import range as paddle_range from paddle.fluid.layers import array_length, array_read, array_write, create_array from paddle.fluid.layers import assign, fill_constant, slice, reduce_all, reduce_any from paddle.fluid.layers import cast, control_flow, logical_and, logical_not, logical_or, nn @@ -26,6 +27,45 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, Dygraph2StaticException +def indexable(x, code=None): + if isinstance(x, Variable): return x + if hasattr(x, '__len__') and hasattr(x, '__getitem__'): return x + if hasattr(x, '__iter__'): + return [i for i in x] + else: + raise RuntimeError("X can't be convert into indexable.") + + +def unpack_by_structure(target, structure): + """ unified unpack interface for paddle and python. + """ + if isinstance(target, Variable): + return _unpack_by_structure_paddle(target, structure) + else: + return _unpack_by_structure_python(target, structure) + + +def _unpack_by_structure_python(target, structure): + """ TODO(xiongkun): analysis the differences between python and paddle unpack. + """ + return _unpack_by_structure_paddle(target, structure) + + +def _unpack_by_structure_paddle(target, structure): + if structure == 1: + return target + ret = [] + for idx, ele in enumerate(structure): + if ele == 1: + ret.append(target[idx]) + continue + if isinstance(ele, list): + ret.append(unpack_by_structure(target[idx], ele)) + continue + assert False, "structure element must be 1 or list" + return ret + + def convert_while_loop(cond, body, getter, setter): """ A function representation of a Python ``while`` statement. @@ -50,12 +90,26 @@ def convert_while_loop(cond, body, getter, setter): def _run_paddle_while(cond, body, getter, setter): # NOTE: loop_vars of Paddle op `control_flow.while_loop` must be Paddle Tensors. - - # UndefinedVar will become data layer not check. - loop_vars = [to_static_variable(var) for var in getter()] + def new_body_fn(*args): + """ wrap the body() and add return value for `while_loop` + """ + body() + return getter() + + def new_cond_fn(*args): + """ cond is a zero-args function, which is not + compatible with `while_loop`. + """ + return cond() + + # UndefinedVar will become data layer not check variable with value=NO_VALUE_MAGIC. + loop_vars = [ + to_static_variable(var) if not isinstance(var, UndefinedVar) else var + for var in getter() + ] setter(loop_vars) # change the non-local var to variable # variable maybe modified to inner var. change it into - loop_vars = control_flow.while_loop(cond, body, loop_vars) + loop_vars = control_flow.while_loop(new_cond_fn, new_body_fn, loop_vars) setter(loop_vars) # change the non-local var to variable return loop_vars @@ -368,6 +422,8 @@ def convert_len(var): 'len(var) only supports LoDTensor/LoDTensorArray/SelectedRows, but received %s.' % type(var)) else: + if isinstance(var, VariableTuple): + return var.__len__() return len(var) @@ -380,6 +436,44 @@ def convert_zip(*args): return zip(*args) +# TODO(xiongkun): delete when list is ready. +class VariableTuple: + """ + this class will cause enumerate can't be wrapped by other iterator change function. + this will be fixed when list is producted. + VariableTuple can only deal with variables which is fixed. 
+ """ + + def __init__(self, var, start=0): + self.var = var + self.len = convert_len(var) + self.rag = paddle_range(start, start + self.len, 1, paddle.int64) + + def __getitem__(self, idx): + return self.rag[idx], self.var[idx] + + def __len__(self): + return self.len + + +def convert_enumerate(*args): + has_variable = any(map(lambda x: isinstance(x, Variable), args)) + if has_variable: + return VariableTuple(*args) + return enumerate(*args) + + +def convert_range(*args): + has_variable = any(map(lambda x: isinstance(x, Variable), args)) + if has_variable: + if len(args) == 1: return paddle_range(0, args[0], 1, paddle.int64) + if len(args) == 2: + return paddle_range(args[0], args[1], 1, paddle.int64) + if len(args) == 3: + return paddle_range(args[0], args[1], args[2], paddle.int64) + return range(*args) + + def convert_shape(x): """ A function representation of the shape of variable. diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py new file mode 100644 index 0000000000000..8ae4c12eb8eaf --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from paddle.utils import gast +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper +from paddle.fluid.dygraph.dygraph_to_static.utils import FunctionNameLivenessAnalysis +from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer + + +class CreateVariableTransformer(BaseTransformer): + """ + """ + + def __init__(self, wrapper_root): + assert isinstance( + wrapper_root, AstNodeWrapper + ), "Type of input node should be AstNodeWrapper, but received %s ." % type( + wrapper_root) + self.root = wrapper_root.node + FunctionNameLivenessAnalysis(self.root) + + def transform(self): + """ + Main function to transform AST. + """ + self.visit(self.root) + + def visit_FunctionDef(self, node): + #attributes = set(filter(lambda x: '.' 
in x, node.pd_scope.modified_vars())) + bodys = node.body + names = sorted(node.pd_scope.created_vars()) + for name in names: + bodys[0:0] = [create_undefined_var(name)] + return node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index a65e86f8e82fd..07d4920d43344 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -34,6 +34,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_nodes from paddle.fluid.dygraph.dygraph_to_static.utils import create_get_args_node, create_set_args_node from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.utils import FOR_ITER_INDEX_PREFIX, FOR_ITER_TUPLE_PREFIX, FOR_ITER_TUPLE_INDEX_PREFIX, FOR_ITER_VAR_LEN_PREFIX, FOR_ITER_VAR_NAME_PREFIX, FOR_ITER_ZIP_TO_LIST_PREFIX, FOR_ITER_TARGET_PREFIX, FOR_ITER_ITERATOR_PREFIX TRUE_FUNC_PREFIX = 'true_fn' FALSE_FUNC_PREFIX = 'false_fn' @@ -304,7 +305,6 @@ def transform_if_else(node, root): """ # TODO(liym27): Consider variable like `self.a` modified in if/else node. - new_vars_to_create = sorted(list(node.pd_scope.created_vars())) return_name_ids = sorted(list(node.pd_scope.modified_vars())) # NOTE: Python can create variable only in if body or only in else body, and use it out of if/else. # E.g. @@ -315,10 +315,6 @@ def transform_if_else(node, root): # # Create static variable for those variables create_new_vars_in_parent_stmts = [] - for name in new_vars_to_create: - # NOTE: Consider variable like `self.a` modified in if/else node. - if "." not in name: - create_new_vars_in_parent_stmts.append(create_undefined_var(name)) nonlocal_names = list(return_name_ids) nonlocal_names.sort() @@ -326,8 +322,21 @@ def transform_if_else(node, root): nonlocal_names = _valid_nonlocal_names(return_name_ids, nonlocal_names) # TODO(dev): Need a better way to deal this. - if ARGS_NAME in nonlocal_names: - nonlocal_names.remove(ARGS_NAME) + # LoopTransformer will create some special vars, which is not visiable by users. so we can sure it's safe to remove them. 
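For illustration, the `VariableTuple` wrapper added in convert_operators.py above can be sketched with plain Python containers standing in for Tensors; the class name and the list-based index below are assumptions of this sketch, not the real Tensor-backed behaviour:

    class VariableTupleSketch:
        """Pairs every element with its index so `enumerate` over a Tensor-like
        object can stay inside the static graph (lists stand in for Tensors)."""

        def __init__(self, var, start=0):
            self.var = var
            self.rag = list(range(start, start + len(var)))

        def __getitem__(self, idx):
            return self.rag[idx], self.var[idx]

        def __len__(self):
            return len(self.var)

    vt = VariableTupleSketch(["a", "b"], start=5)
    assert vt[1] == (6, "b") and len(vt) == 2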
+ filter_names = [ + ARGS_NAME, FOR_ITER_INDEX_PREFIX, FOR_ITER_TUPLE_PREFIX, + FOR_ITER_TARGET_PREFIX, FOR_ITER_ITERATOR_PREFIX, + FOR_ITER_TUPLE_INDEX_PREFIX, FOR_ITER_VAR_LEN_PREFIX, + FOR_ITER_VAR_NAME_PREFIX, FOR_ITER_ZIP_TO_LIST_PREFIX + ] + + def remove_if(x): + for name in filter_names: + if x.startswith(name): return False + return True + + nonlocal_names = list(filter(remove_if, nonlocal_names)) + return_name_ids = nonlocal_names nonlocal_stmt_node = create_nonlocal_stmt_nodes(nonlocal_names) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 29ac905074e1d..099f669748035 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -26,8 +26,8 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import generate_name_node from paddle.fluid.dygraph.dygraph_to_static.utils import get_attribute_full_name from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_undefined_var -from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node from paddle.fluid.dygraph.dygraph_to_static.utils import create_nonlocal_stmt_nodes, create_get_args_node, create_set_args_node +from paddle.fluid.dygraph.dygraph_to_static.utils import FunctionNameLivenessAnalysis from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import ARGS_NAME from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer from paddle.fluid.dygraph.dygraph_to_static.base_transformer import RenameTransformer @@ -483,10 +483,10 @@ def __init__(self, wrapper_root): ), "Input non-AstNodeWrapper node for the initialization of LoopTransformer." self.wrapper_root = wrapper_root self.root = wrapper_root.node + FunctionNameLivenessAnalysis(self.root) def transform(self): ForLoopTuplePreTransformer(self.wrapper_root).transform() - self.name_visitor = NameVisitor(self.root) self.visit(self.root) def visit_While(self, node): @@ -537,19 +537,19 @@ def get_for_stmt_nodes(self, node): return [node] init_stmts, cond_stmt, body_stmts = stmts_tuple # 2. get original loop vars - loop_var_names, create_var_names = self.name_visitor.get_loop_var_names( - node) + loop_var_names, create_var_names = node.pd_scope.modified_vars( + ), node.pd_scope.created_vars() + # TODO: Remove the bunch of code? We have the unique format `for A in B:` # NOTE: in 'for x in var' or 'for i, x in enumerate(var)' cases, # we need append new loop var & remove useless loop var # 1. for x in var -> x is no need # 2. for i, x in enumerate(var) -> x is no need - if current_for_node_parser.is_for_iter( - ) or current_for_node_parser.is_for_enumerate_iter(): + if current_for_node_parser.is_for_iter(): iter_var_name = current_for_node_parser.iter_var_name iter_idx_name = current_for_node_parser.iter_idx_name loop_var_names.add(iter_idx_name) - if iter_var_name not in create_var_names: - loop_var_names.remove(iter_var_name) + if current_for_node_parser.enum_idx_name is not None: + loop_var_names.add(current_for_node_parser.enum_idx_name) # 3. prepare result statement list new_stmts = [] @@ -559,10 +559,8 @@ def get_for_stmt_nodes(self, node): # y += x # print(x) # x = 10 # - # We need to create static variable for those variables - for name in create_var_names: - if "." 
not in name: - new_stmts.append(create_undefined_var(name)) + # We don't need to create static variable for them, because + # we do this in CreateUndefinedVarTransformer # create non-local statement for body and cond. nonlocal_names = list(loop_var_names | create_var_names) @@ -581,10 +579,7 @@ def get_for_stmt_nodes(self, node): name=unique_name.generate(FOR_CONDITION_PREFIX), args=gast.arguments(args=[], posonlyargs=[], - vararg=gast.Name(id=ARGS_NAME, - ctx=gast.Param(), - annotation=None, - type_comment=None), + vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, @@ -597,17 +592,11 @@ def get_for_stmt_nodes(self, node): # 6. create & append loop body function node # append return values for loop body - body_stmts.append( - gast.Return(value=generate_name_node( - nonlocal_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(FOR_BODY_PREFIX), args=gast.arguments(args=[], posonlyargs=[], - vararg=gast.Name(id=ARGS_NAME, - ctx=gast.Param(), - annotation=None, - type_comment=None), + vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, @@ -632,8 +621,8 @@ def get_for_stmt_nodes(self, node): return new_stmts def get_while_stmt_nodes(self, node): - loop_var_names, create_var_names = self.name_visitor.get_loop_var_names( - node) + loop_var_names, create_var_names = node.pd_scope.modified_vars( + ), node.pd_scope.created_vars() new_stmts = [] # create non-local statement for body and cond. @@ -652,19 +641,14 @@ def get_while_stmt_nodes(self, node): # y = x # z = y # - # We need to create static variable for those variables - for name in create_var_names: - if "." not in name: - new_stmts.append(create_fill_constant_node(name)) + # We don't need to create static variable for those variables, because + # we do this in CreateUndefinedVarTransformer condition_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_CONDITION_PREFIX), args=gast.arguments(args=[], posonlyargs=[], - vararg=gast.Name(id=ARGS_NAME, - ctx=gast.Param(), - annotation=None, - type_comment=None), + vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, @@ -677,17 +661,11 @@ def get_while_stmt_nodes(self, node): new_stmts.append(condition_func_node) new_body = node.body - new_body.append( - gast.Return(value=generate_name_node( - nonlocal_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(WHILE_BODY_PREFIX), args=gast.arguments(args=[], posonlyargs=[], - vararg=gast.Name(id=ARGS_NAME, - ctx=gast.Param(), - annotation=None, - type_comment=None), + vararg=None, kwonlyargs=[], kw_defaults=None, kwarg=None, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 9f390252f3a2c..ed7faf83cefe5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -82,6 +82,8 @@ def visit(self, node): FOR_ITER_INDEX_PREFIX = '__for_loop_var_index' FOR_ITER_TUPLE_PREFIX = '__for_loop_iter_tuple' +FOR_ITER_TARGET_PREFIX = '__for_loop_iter_target' +FOR_ITER_ITERATOR_PREFIX = '__for_loop_iter_iterator' FOR_ITER_TUPLE_INDEX_PREFIX = '__for_loop_iter_tuple_index' FOR_ITER_VAR_LEN_PREFIX = '__for_loop_var_len' FOR_ITER_VAR_NAME_PREFIX = '__for_loop_iter_var' @@ -1099,6 +1101,18 @@ def _nearest_function_scope(self): if isinstance(node, gast.FunctionDef): return self._get_name_scope(node) + def visit_ListComp(self, node): + """ [ i for i in range(10) ] + In this case, 
`i` will not created in FunctionScope. + We don't collect `i` by not calling generic_visit. + """ + pass + + def visit_DictComp(self, node): + """ the same as ListComp. + """ + pass + def visit_Name(self, node): self.generic_visit(node) write_context = (gast.Store, gast.AugStore, gast.Del) @@ -1149,8 +1163,13 @@ def _visit_controlflow_node(self, node): def post_func(): self._father_name_scope().merge_from(self._current_name_scope()) + self._nearest_function_scope().merge_from( + self._current_name_scope()) self._current_name_scope().created = self._nearest_function_scope( ).existed_vars() - node.before_created + # gather created vars into father and used in CreateUndefinedVarTransform + self._nearest_function_scope().created |= self._current_name_scope( + ).created def pre_func(): setattr(node, "before_created", diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index bc1a2c15dd3ac..d7b859612473f 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -108,7 +108,6 @@ def select_input(inputs, mask): def select_input_with_buildin_type(inputs, mask): from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, create_undefined_var_like - support_ret_buildin_type = (bool, float, six.integer_types) false_var, true_var = inputs if isinstance(false_var, UndefinedVar) and isinstance( @@ -1182,12 +1181,16 @@ def _complete(self): }) +support_ret_buildin_type = (bool, float, six.integer_types) + + def assign_skip_lod_tensor_array(input, output): """ Assign input to output, but skip the process of copying LoDTensorArray unless it's created in while_block. """ if not isinstance(input, (Variable, core.VarBase)): - if isinstance(output, Variable): + if isinstance(output, Variable) and isinstance( + input, support_ret_buildin_type): assign(input, output) else: output = input @@ -1297,6 +1300,7 @@ def body(i, ten): if not isinstance(output_vars, (list, tuple)): output_vars = [output_vars] try: + loop_vars = _deal_with_undefined_var(output_vars, loop_vars) assert_same_structure(output_vars, loop_vars, check_types=False) except ValueError as e: raise ValueError( @@ -1308,6 +1312,36 @@ def body(i, ten): return loop_vars +def _deal_with_undefined_var(output_vars, loop_vars): + """ Deal with undefined var cases, We create undefined variable based on the results of body(). + In Dy2Static, we use undefined var to represent the var created in control flow. This function + expand the loop_vars and replace original loop_vars. + 1. UndefinedVar = Variable # create a variable + 2. UndefinedVar = None # create a undefined var with RETURN_NO_VALUE_MAGIC_NUM + 3. UndefinedVar = List(int) # create a list of variable + 4. 
UndefinedVar = value # create a variable + """ + from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, create_undefined_variable + + def create_var_like(o_var): + if isinstance(o_var, + (Variable, ) + support_ret_buildin_type) or o_var is None: + return create_undefined_variable() + if isinstance(o_var, (tuple, list)): + return [create_undefined_variable() for i in range(len(o_var))] + + if len(output_vars) != len(loop_vars): + raise ValueError("The length of loop_vars should be the same.") + + results = [] + for o_var, l_var in zip(output_vars, loop_vars): + if isinstance(l_var, UndefinedVar) or l_var is None: + results.append(create_var_like(o_var)) + else: + results.append(l_var) + return results + + def lod_rank_table(x, level=0): """ LoD Rank Table Operator. Given an input variable **x** and a level number @@ -2616,6 +2650,11 @@ def map_fn(x): def expand_undefined_var(nest1, nest2, names): + """ TODO: make this function recursively. + nest1: Var1, (UndefinedVar, [1,2,3]) + nest2: Var2, ([1,2,3,4], UndefinedVar) + In this case, we should not expand recursively. + """ from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_VALUE_PREFIX diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index b544ca9bd8344..ce322db06cf8c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -385,6 +385,7 @@ def beam_search(self, inputs): dropout_implementation='upscale_in_train') else: step_input = new_hidden + cell_outputs = self._split_batch_beams(step_input) cell_outputs = self.fc(cell_outputs) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py index 683135b9078dc..ff3e0da6fea17 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py @@ -442,13 +442,6 @@ class TestErrorInForLoop(TestTransformForLoop): def _init_dyfunc(self): self.dyfunc = for_loop_dyfunc_not_support - def test_ast_to_func(self): - with self.assertRaisesRegexp( - NotImplementedError, - "Dynamic-to-Static only supports the step value is a constant or negative constant " - ): - self._run_static() - if __name__ == '__main__': with fluid.framework._test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index c7cecab04f564..27debe00af10a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -66,6 +66,9 @@ def get_source_code(func): class StaticCode1(): def dyfunc_with_if_else(x_v, label=None): + loss = _jst.UndefinedVar('loss') + __return_1 = _jst.UndefinedVar('__return_1') + __return_0 = _jst.UndefinedVar('__return_0') __return_value_0 = None def get_args_0(): @@ -89,9 +92,6 @@ def false_fn_0(): _jst.IfElse( paddle.mean(x_v)[0] > 5, true_fn_0, false_fn_0, get_args_0, set_args_0, ('x_v', )) - __return_0 = _jst.UndefinedVar('__return_0') - __return_1 = _jst.UndefinedVar('__return_1') - loss = _jst.UndefinedVar('loss') def 
get_args_1(): nonlocal __return_0, __return_1, __return_value_0, loss @@ -123,6 +123,9 @@ def false_fn_1(): class StaticCode2(): # TODO: Transform return statement def dyfunc_with_if_else(x_v, label=None): + loss = _jst.UndefinedVar('loss') + __return_3 = _jst.UndefinedVar('__return_3') + __return_2 = _jst.UndefinedVar('__return_2') __return_value_1 = None def get_args_2(): @@ -146,9 +149,6 @@ def false_fn_2(): _jst.IfElse( paddle.mean(x_v)[0] > 5, true_fn_2, false_fn_2, get_args_2, set_args_2, ('x_v', )) - __return_2 = _jst.UndefinedVar('__return_2') - __return_3 = _jst.UndefinedVar('__return_3') - loss = _jst.UndefinedVar('loss') def get_args_3(): nonlocal __return_2, __return_3, __return_value_1, loss diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index 9b1cde6dcc5e1..0d1dc69823a56 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -578,8 +578,8 @@ def _set_test_func(self): self.dygraph_func = dyfunc_with_for_1 def _set_expected_op_num(self): - self.expected_op_num = 22 - self.expected_shape_op_num = 3 + self.expected_op_num = 29 + self.expected_shape_op_num = 2 self.expected_slice_op_num = 3 @@ -589,7 +589,7 @@ def _set_test_func(self): self.dygraph_func = dyfunc_with_while_1 def _set_expected_op_num(self): - self.expected_op_num = 22 + self.expected_op_num = 21 self.expected_shape_op_num = 3 self.expected_slice_op_num = 3 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index 2239c6544f219..57b6fc55efb97 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -21,7 +21,7 @@ from paddle.fluid.dygraph import Embedding, Layer, LayerNorm, Linear, to_variable from paddle.fluid.dygraph.jit import dygraph_to_static_func from paddle.fluid.layers.utils import map_structure -from paddle.fluid.layers.tensor import range as pd_range +import paddle def position_encoding_init(n_position, d_pos_vec): @@ -634,7 +634,7 @@ def gather(input, indices, batch_pos): value=0), } for i in range(self.n_layer)] - for i in pd_range(0, max_len, 1, dtype="int32"): + for i in range(paddle.to_tensor(max_len)): trg_pos = layers.fill_constant(shape=trg_word.shape, dtype="int64", value=i) diff --git a/python/paddle/jit/dy2static/__init__.py b/python/paddle/jit/dy2static/__init__.py index 0a51a3e265ede..ebb4d30a41212 100644 --- a/python/paddle/jit/dy2static/__init__.py +++ b/python/paddle/jit/dy2static/__init__.py @@ -26,7 +26,8 @@ from .convert_operators import convert_print as Print # noqa: F401 from .convert_operators import convert_shape as Shape # noqa: F401 from .convert_operators import convert_while_loop as While # noqa: F401 - +from .convert_operators import unpack_by_structure as Unpack # noqa: F401 +from .convert_operators import indexable as Indexable # noqa: F401 from .variable_trans_func import create_bool_as_type # noqa: F401 from .variable_trans_func import to_static_variable # noqa: F401 from .convert_operators import convert_shape_compare # noqa: F401 diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index 59ffedef0a900..691c8c0cfbea3 100644 --- 
a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -26,5 +26,6 @@ from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_var_dtype # noqa: F401 from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_shape # noqa: F401 from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_while_loop # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import unpack_by_structure, indexable # noqa: F401 __all__ = [] From f6ff2221a4337b213e914179c6060e4501982ad2 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 12 Jul 2022 20:22:17 +0800 Subject: [PATCH 151/250] fix fused attention, ffn, fm under new process group (#44259) --- .../operators/fused/fused_attention_op.cu | 36 +++++++++++++------ .../operators/fused/fused_feedforward_op.cu | 36 +++++++++++++------ .../fused/fused_multi_transformer_op.cu | 36 +++++++++++++------ 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 0c33f7c9d4f9b..2c3fd75d8e012 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -24,11 +24,13 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -44,16 +46,30 @@ static void AllReduce(framework::Tensor &tensor, // NOLINT const platform::CUDADeviceContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto dtype = - platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); - int64_t numel = tensor.numel(); - const void *sendbuff = tensor.data(); - auto place = ctx.GetPlace(); - void *recvbuff = tensor.mutable_data(place); - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + + if (map->has(ring_id)) { + paddle::distributed::ProcessGroup *pg = map->get(ring_id); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(tensor); + out_tensor.push_back(tensor); + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + auto dtype = platform::ToNCCLDataType( + framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, 
ncclSum, comm->comm(), stream)); + } #else PADDLE_THROW(platform::errors::Unimplemented( "PaddlePaddle should compile with NCCL or RCCL when used tensor model " diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index fe388aa40566e..4126f5ad7263a 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -17,11 +17,13 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/operators/matmul_v2_op.h" +#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -37,16 +39,30 @@ static void AllReduce(framework::Tensor& tensor, // NOLINT const platform::CUDADeviceContext& ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto dtype = - platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); - int64_t numel = tensor.numel(); - const void* sendbuff = tensor.data(); - auto place = ctx.GetPlace(); - void* recvbuff = tensor.mutable_data(place); - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + + if (map->has(ring_id)) { + paddle::distributed::ProcessGroup* pg = map->get(ring_id); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(tensor); + out_tensor.push_back(tensor); + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + auto dtype = platform::ToNCCLDataType( + framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void* sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void* recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + } #else PADDLE_THROW(platform::errors::Unimplemented( "PaddlePaddle should compile with NCCL or RCCL when used tensor model " diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index fafbcf724d726..a8bebd5012db5 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -29,9 +29,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -50,16 +52,30 @@ static void AllReduce(framework::Tensor &tensor, // NOLINT const platform::CUDADeviceContext &ctx) { if (ring_id == -1) return; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto dtype = - platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); - int64_t numel = tensor.numel(); - const void *sendbuff = tensor.data(); - auto place = ctx.GetPlace(); - void *recvbuff = tensor.mutable_data(place); - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + + if (map->has(ring_id)) { + paddle::distributed::ProcessGroup *pg = map->get(ring_id); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(tensor); + out_tensor.push_back(tensor); + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + auto dtype = platform::ToNCCLDataType( + framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); + } #else PADDLE_THROW(platform::errors::Unimplemented( "PaddlePaddle should compile with NCCL or RCCL when used tensor model " From 60bad4649281c3583be17df51fe2018d2cab49a6 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Tue, 12 Jul 2022 21:14:40 +0800 Subject: [PATCH 152/250] Disable CompiledProgram with data parallel in StandaloneExecutor (#44264) * Disable CompiledProgram with data parallel in StandaloneExecutor * Fix typos * Fix typos --- python/paddle/fluid/executor.py | 43 +++++++++++++-------------------- 1 file changed, 17 insertions(+), 26 deletions(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 5f80e3b757770..7e450710d211c 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -409,18 +409,6 @@ def _is_enable_standalone_executor(): return flag -def _is_standalone_executor_enable_compiled_program(): - """ - Whether to use experimental executor `StandaloneExecutor` in CompiledProgram. - Convert Graph to Program. 
- """ - flag = False - env_val = os.environ.get('FLAGS_CONVERT_GRAPH_TO_PROGRAM', None) - if env_val in [1, '1', True, 'True', 'true']: - flag = True - return flag - - def _prepare_fleet_executor(): from ..distributed.fleet.proto import fleet_executor_desc_pb2 trainer_endpoints_str = os.getenv("PADDLE_TRAINER_ENDPOINTS", "") @@ -1405,18 +1393,22 @@ def _can_use_interpreter_core(program, place): return False compiled = isinstance(program, compiler.CompiledProgram) - # print("compiled is : {}".format(compiled)) - # NOTE(zhiqiu): do not support compiled program now if compiled: - if program._program is not None and _is_standalone_executor_enable_compiled_program( - ): - return True - return False - # if program._is_data_parallel and len( - # program._get_places(place, program._places)) == 1: - # return True - # else: - # return False + # Unsupported case 1 : the CompiledProgram is constructed by Graph + if program._program is None: + return False + + # Unsupported case 2 : disabled by FLAGS_CONVERT_GRAPH_TO_PROGRAM + if os.environ.get('FLAGS_CONVERT_GRAPH_TO_PROGRAM', + None) not in [1, '1', True, 'True', 'true']: + return False + + # Unsupported case 3: data parallel + if program._is_data_parallel == True and len( + program._get_places(place, program._places)) != 1: + return False + + return True else: if isinstance(program._graph, compiler.CompiledProgram): return False @@ -1447,11 +1439,10 @@ def _can_use_interpreter_core(program, place): # a little bit tricy here, use inner_program before _add_feed_fetch_ops to get key # while use program to geet _StandaloneExecutor if key not in self._executor_cache._cached_executors: + # To apply IR pass, compile the Program to IrGraph and convert it back to Program if isinstance(program, compiler.CompiledProgram): program._compile(scope, self.place) - compiled_graph = program._graph - ir_graph = framework.IrGraph(compiled_graph, - for_test=True) + ir_graph = framework.IrGraph(program._graph) inner_program = ir_graph.to_program() program = self._add_feed_fetch_ops( program=inner_program, From 9e9b02d3f5fba46b2d0126970c2b5083fdffc759 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 12 Jul 2022 21:31:31 +0800 Subject: [PATCH 153/250] [operator migration] Migrate unique consecutive infer shape and yaml (#44248) --- .../fluid/operators/unique_consecutive_op.cc | 56 +++-------------- paddle/phi/api/yaml/legacy_api.yaml | 9 +++ paddle/phi/infermeta/unary.cc | 60 +++++++++++++++++++ paddle/phi/infermeta/unary.h | 9 +++ .../unittests/test_unique_consecutive_op.py | 10 +++- python/paddle/tensor/manipulation.py | 13 +++- 6 files changed, 107 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/operators/unique_consecutive_op.cc b/paddle/fluid/operators/unique_consecutive_op.cc index 0a36af362deb0..97cd31141da2b 100644 --- a/paddle/fluid/operators/unique_consecutive_op.cc +++ b/paddle/fluid/operators/unique_consecutive_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,53 +25,6 @@ class UniqueConsecutiveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique_consecutive"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "unique_consecutive"); - - auto in_dims = ctx->GetInputDim("X"); - bool return_inverse = ctx->Attrs().Get("return_inverse"); - bool return_counts = ctx->Attrs().Get("return_counts"); - auto axis_vec = ctx->Attrs().Get>("axis"); - if (return_inverse) { - OP_INOUT_CHECK( - ctx->HasOutput("Index"), "Output", "Index", "unique_consecutive"); - } - if (return_counts) { - OP_INOUT_CHECK( - ctx->HasOutput("Counts"), "Output", "Counts", "unique_consecutive"); - } - - if (axis_vec.empty()) { - ctx->SetOutputDim("Out", {-1}); - if (return_inverse) { - ctx->SetOutputDim("Index", {phi::product(in_dims)}); - } - } else { - int axis = axis_vec[0]; - if (axis < 0) { - axis += in_dims.size(); - } - PADDLE_ENFORCE_LT( - axis, - in_dims.size(), - platform::errors::InvalidArgument("The axis(%d) should be less than " - "the dimension size(%d) of x.", - axis, - in_dims.size())); - auto out_dims = in_dims; - out_dims[axis] = -1; - ctx->SetOutputDim("Out", out_dims); - if (return_inverse) { - ctx->SetOutputDim("Index", {in_dims[axis]}); - } - } - if (return_counts) { - ctx->SetOutputDim("Counts", {-1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -114,9 +70,13 @@ class UniqueConsecutiveOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(unique_consecutive, + UniqueConsecutiveInferShapeFunctor, + PD_INFER_META(phi::UniqueConsecutiveInferMeta)); REGISTER_OP_WITHOUT_GRADIENT(unique_consecutive, ops::UniqueConsecutiveOp, - ops::UniqueConsecutiveOpMaker); + ops::UniqueConsecutiveOpMaker, + UniqueConsecutiveInferShapeFunctor); REGISTER_OP_VERSION(unique_consecutive) .AddCheckpoint( R"ROC( diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index c307fc7a19d5d..aa86c0f34db55 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -2205,6 +2205,15 @@ func : unique data_type : x +- api : unique_consecutive + args : (Tensor x, bool return_inverse, bool return_counts, int[] axis, int dtype) + output : Tensor(out), Tensor(index), Tensor(counts) + infer_meta : + func : UniqueConsecutiveInferMeta + kernel : + func : unique_consecutive + data_type : x + - api : unsqueeze args : (Tensor x, IntArray axis) output : Tensor(out), Tensor(xshape) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 9c5286c066a2b..0048f130adf62 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -2999,6 +2999,66 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } +void UniqueConsecutiveInferMeta(const MetaTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + MetaTensor* out, + MetaTensor* index, + MetaTensor* counts) { + PADDLE_ENFORCE_NE(out, + 
nullptr, + phi::errors::InvalidArgument( + "unique_consecutive should have output tensor out.")); + + auto in_dims = x.dims(); + if (return_inverse) { + PADDLE_ENFORCE_NE( + index, + nullptr, + phi::errors::InvalidArgument("Tensor index should not be null if " + "return_inverse is set to True.")); + } + if (return_counts) { + PADDLE_ENFORCE_NE( + counts, + nullptr, + phi::errors::InvalidArgument("Tensor counts should not be null if " + "return_counts is set to True.")); + } + + if (axis.empty()) { + out->set_dims({-1}); + out->set_dtype(x.dtype()); + if (return_inverse) { + index->set_dims({phi::product(in_dims)}); + } + } else { + int axis_value = axis[0]; + if (axis_value < 0) { + axis_value += in_dims.size(); + } + PADDLE_ENFORCE_LT( + axis_value, + in_dims.size(), + phi::errors::InvalidArgument("The axis(%d) should be less than " + "the dimension size(%d) of x.", + axis_value, + in_dims.size())); + auto out_dims = in_dims; + out_dims[axis_value] = -1; + out->set_dims(out_dims); + out->set_dtype(x.dtype()); + if (return_inverse) { + index->set_dims({in_dims[axis_value]}); + } + } + if (return_counts) { + counts->set_dims({-1}); + } +} + void UniqueInferMeta(const MetaTensor& x, bool return_index, bool return_inverse, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 591fb9553a1eb..0b9298cfd362f 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -420,6 +420,15 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void UniqueConsecutiveInferMeta(const MetaTensor& x, + bool return_inverse, + bool return_counts, + const std::vector& axis, + int dtype, + MetaTensor* out, + MetaTensor* index, + MetaTensor* counts); + void UniqueInferMeta(const MetaTensor& x, bool return_index, bool return_inverse, diff --git a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py index b4a4eac0ba74f..8ec0bcca4bc4e 100644 --- a/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py +++ b/python/paddle/fluid/tests/unittests/test_unique_consecutive_op.py @@ -72,6 +72,7 @@ def config(self): self.x_range = 20 self.return_inverse = False self.return_counts = False + self.python_api = paddle.unique_consecutive def init_kernel_type(self): self.dtype = "float32" if core.is_compiled_with_rocm() else "float64" @@ -88,13 +89,14 @@ def setUp(self): self.inputs = { 'X': x, } + self.python_out_sig = ["Out"] self.attrs = {'dtype': int(core.VarDesc.VarType.INT32)} self.outputs = { 'Out': out, } def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) class TestUniqueConsecutiveOp2(TestUniqueConsecutiveOp): @@ -105,6 +107,7 @@ def config(self): self.x_range = 20 self.return_inverse = True self.return_counts = False + self.python_api = paddle.unique_consecutive def setUp(self): self.init_kernel_type() @@ -122,6 +125,7 @@ def setUp(self): 'return_inverse': self.return_inverse, 'dtype': int(core.VarDesc.VarType.INT32) } + self.python_out_sig = ["Out"] self.outputs = {'Out': result, 'Index': inverse} @@ -133,6 +137,7 @@ def config(self): self.x_range = 20 self.return_inverse = False self.return_counts = True + self.python_api = paddle.unique_consecutive def setUp(self): self.init_kernel_type() @@ -150,6 +155,7 @@ def setUp(self): 'return_counts': self.return_counts, 'dtype': int(core.VarDesc.VarType.INT32) } + self.python_out_sig = ["Out"] self.outputs = {'Out': result, 'Counts': counts} @@ -161,6 +167,7 @@ 
def config(self): self.x_range = 20 self.return_inverse = True self.return_counts = True + self.python_api = paddle.unique_consecutive def setUp(self): self.init_kernel_type() @@ -180,6 +187,7 @@ def setUp(self): 'return_counts': self.return_counts, 'dtype': int(core.VarDesc.VarType.INT32) } + self.python_out_sig = ["Out"] self.outputs = {'Out': result, 'Index': inverse, 'Counts': counts} diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index c445402412e16..8d2bfa2a2cb64 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2066,7 +2066,18 @@ def unique_consecutive(x, else: axis = [axis] attr_dtype = convert_np_dtype_to_dtype_(dtype) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + out, inverse, counts = _C_ops.final_state_unique_consecutive( + x, return_inverse, return_counts, axis, attr_dtype) + outs = [out] + if return_inverse: + outs.append(inverse) + if return_counts: + outs.append(counts) + if len(outs) == 1: + return outs[0] + return tuple(outs) + elif paddle.in_dynamic_mode(): out, inverse, counts = _C_ops.unique_consecutive( x, 'dtype', attr_dtype, 'return_inverse', return_inverse, 'return_counts', return_counts, 'axis', axis) From 015532b4eb19627b8324efe7e6a77aaeb4b541f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Tue, 12 Jul 2022 21:32:39 +0800 Subject: [PATCH 154/250] add xpu_kp support for standalone executor. test=develop (#44231) --- paddle/fluid/framework/operator.cc | 37 +++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0a5de2bd3f262..0fca87df34f5a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1300,17 +1300,38 @@ bool OperatorWithKernel::SupportsKernelType( const OpKernelType& kernel_type) const { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); - bool support = - kernels_iter != all_op_kernels.end() && - kernels_iter->second.find(kernel_type) != kernels_iter->second.end(); -#if defined(PADDLE_WITH_XPU) + if (kernels_iter == all_op_kernels.end()) return false; + OpKernelMap& kernels = kernels_iter->second; + auto kernel_iter = kernels.find(kernel_type); + +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) if (paddle::platform::is_xpu_place(kernel_type.place_)) { - support = support && - paddle::platform::is_xpu_support_op(type_, kernel_type) && - !paddle::platform::is_in_xpu_black_list(type_); + return kernel_iter != kernels.end() && + paddle::platform::is_xpu_support_op(type_, kernel_type) && + !paddle::platform::is_in_xpu_black_list(type_); } #endif - return support; + +#ifdef PADDLE_WITH_XPU_KP + if (paddle::platform::is_xpu_place(kernel_type.place_)) { + bool use_xpu_kp_kernel_rt = + FLAGS_run_kp_kernel && + paddle::platform::is_xpu_kp_support_op(type_, kernel_type); + bool use_xpu_kp_kernel_debug = + paddle::platform::is_in_xpu_kpwhite_list(type_); + bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug); + if (is_xpu_kp_support) { + auto tmp_kernel_type = kernel_type; + tmp_kernel_type.library_type_ = LibraryType::kKP; + return kernels.find(tmp_kernel_type) != kernels.end(); + } + return kernel_iter != kernels.end() && + paddle::platform::is_xpu_support_op(type_, kernel_type) && + !paddle::platform::is_in_xpu_black_list(type_); + } +#endif + + return kernel_iter != 
kernels.end(); } bool OperatorWithKernel::CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, From c797e64db0837ed21c0440572302a95217bb3d89 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Tue, 12 Jul 2022 17:03:02 +0200 Subject: [PATCH 155/250] Add pool avg to quantization and concat scales correction (#44186) --- .../compute_propagate_scales_mkldnn_pass.cc | 7 ++++++ .../ir/mkldnn/quant_dequant_mkldnn_pass.cc | 18 --------------- .../quantization/quant2_int8_mkldnn_pass.py | 23 +++++++++---------- 3 files changed, 18 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index f7ee6a96dce04..99eaab49b7926 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -390,6 +390,13 @@ std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( } else if (out_iter != var_quant_scales->end()) { (*var_quant_scales)[input_name] = out_iter->second; } + } else if (op_name == "concat") { + auto out_iter = var_quant_scales->find(op_node->Op()->Output("Out")[0]); + if (out_iter != var_quant_scales->end()) { + std::vector input_names = op_node->Op()->Input("X"); + for (auto input_name : input_names) + (*var_quant_scales)[input_name] = out_iter->second; + } } else if (op_name == "scale") { const std::string output_name = op_node->Op()->Output("Out")[0]; auto out_iter = var_quant_scales->find(output_name); diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 40c6050a3c3f1..42c54fcb36242 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -55,23 +55,6 @@ void QuantDequantMkldnnPass::MarkSkipQuantizedOps( } } -void QuantDequantMkldnnPass::MarkSkipQuantizedPool2d(ir::Graph* graph) const { - VLOG(3) << "mark avg pool2d as skip quantized op"; - for (auto* op_node : - ir::TopologyVarientSort(*graph, static_cast(0))) { - if (!op_node->IsOp()) continue; - - if (op_node->Name() == "pool2d") { - auto* op_desc = op_node->Op(); - auto pool_type = - BOOST_GET_CONST(std::string, op_desc->GetAttr("pooling_type")); - if (pool_type == "avg") { - op_node->Op()->SetAttr("skip_quant", 1); - } - } - } -} - void QuantDequantMkldnnPass::CollectInfoFromFake( ir::Graph* graph, Scope* scope, @@ -548,7 +531,6 @@ void QuantDequantMkldnnPass::ApplyImpl(ir::Graph* graph) const { auto* scope = param_scope(); MarkSkipQuantizedOps(graph, skip_ops); - MarkSkipQuantizedPool2d(graph); CollectInfoFromFake(graph, scope, fake_dequantize_types, &weight_thresholds); CollectInputScalesFromFake( graph, scope, fake_quantize_types, &var_quant_scales); diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 0d17673a2d522..2f155ca0edfc2 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -264,6 +264,14 @@ def _update_scales(graph): elif output_name in self._var_quant_scales: self._var_quant_scales[ input_name] = self._var_quant_scales[output_name] + elif op.name() == 'concat': + output_name = op.output("Out")[0] + if output_name in self._var_quant_scales: + input_names = 
op.input("X") + for input_name in input_names: + self._var_quant_scales[ + input_name] = self._var_quant_scales[ + output_name] elif op.name() in self._scale_ops: input_name = op.input("X")[0] output_name = op.output("Out")[0] @@ -595,13 +603,6 @@ def _compute_lstm_weight_scales(wx_name, wh_name): _compute_lstm_weight_scales("WeightX", "WeightH") return graph - def _find_avg_pooling_ids(self, graph): - for op in graph.all_op_nodes(): - if op.name() in self._pool_ops: - if op.op().attr("pooling_type") == "avg": - self._op_ids_to_skip.add(op.id()) - return self._op_ids_to_skip - def _update_relu_output_scales(self, graph): def _set_unsigned_scale(graph, ops, op_out_name, predicate): @@ -651,11 +652,9 @@ def _quantize_fp32_graph(self, graph): 'reshape_transpose_matmul_mkldnn_fuse_pass') graph = self._apply_pass( graph, 'reshape_transpose_matmul_v2_mkldnn_fuse_pass') - graph = self._apply_pass( - graph, 'cpu_quantize_placement_pass', - ['quantize_enabled_op_types', 'quantize_excluded_op_ids'], - [self._ops_to_quantize, - self._find_avg_pooling_ids(graph)]) + graph = self._apply_pass(graph, 'cpu_quantize_placement_pass', + ['quantize_enabled_op_types'], + [self._ops_to_quantize]) graph = self._apply_pass( graph, 'cpu_quantize_pass', ['quant_var_scales', 'data_layout'], [self._var_quant_scales, From 0470e9da25f3dfcab5bf3a34a8d2540af71e783e Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Wed, 13 Jul 2022 10:00:27 +0800 Subject: [PATCH 156/250] add grid_sampler and update relu op for xpu. (#44227) * grid sampler op for xpu. test=kunlun * update relu xdnn api. test=kunlun. --- cmake/external/xpu.cmake | 4 +- paddle/fluid/operators/activation_op_xpu.cc | 27 +- paddle/fluid/operators/grid_sampler_op_xpu.cc | 138 +++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 2 + .../unittests/xpu/test_grid_sampler_op_xpu.py | 284 ++++++++++++++++++ 5 files changed, 444 insertions(+), 11 deletions(-) create mode 100644 paddle/fluid/operators/grid_sampler_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 3228f5a556c2e..25d01912f1419 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220707") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220708") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -19,7 +19,7 @@ endif() if(NOT DEFINED XPU_XDNN_BASE_URL) set(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220707") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220708") else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index 67a1d70ebad44..613eea90a6500 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -157,15 +157,6 @@ struct XPUReciprocalGradFunctor : public BaseActivationFunctor { } }; -template -struct XPUReluFunctor : public BaseActivationFunctor { - using XPUType = typename XPUTypeTrait::Type; - void operator()(const framework::ExecutionContext &ctx) const { - xpu_activation_forward( - ctx, xpu::relu); - } -}; - template 
struct XPUReluGradFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; @@ -416,6 +407,24 @@ struct XPUPowGradFunctor : public BaseActivationFunctor { } }; +template +struct XPUReluFunctor : public BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + void operator()(const framework::ExecutionContext &ctx) const { + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Out"); + const XPUType *x_data = reinterpret_cast(x->data()); + XPUType *y_data = + reinterpret_cast(y->mutable_data(ctx.GetPlace())); + + auto xpu_context = + ctx.device_context().x_context(); + int r = + xpu::relu(xpu_context, x_data, y_data, x->numel(), nullptr, nullptr); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); + } +}; + template struct XPUSoftPlusFunctor : public BaseActivationFunctor { void operator()(const framework::ExecutionContext &ctx) const { diff --git a/paddle/fluid/operators/grid_sampler_op_xpu.cc b/paddle/fluid/operators/grid_sampler_op_xpu.cc new file mode 100644 index 0000000000000..2843a90492cec --- /dev/null +++ b/paddle/fluid/operators/grid_sampler_op_xpu.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_XPU + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class GridSamplerXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(context.GetPlace()), + true, + platform::errors::Unavailable("This kernel only runs on XPU.")); + + // input and output data + const Tensor* input = context.Input("X"); + const Tensor* grid = context.Input("Grid"); + Tensor* output = context.Output("Output"); + + int n = input->dims()[0]; + int c = input->dims()[1]; + int h = input->dims()[2]; + int w = input->dims()[3]; + int out_h = grid->dims()[1]; + int out_w = grid->dims()[2]; + + // attrs + // paddle.nn.functional.grid_sample(x, grid, mode='bilinear', + // padding_mode='zeros', align_corners=True, name=None) + const std::string mode = context.Attr("mode"); + const std::string padding_mode = context.Attr("padding_mode"); + bool align_corners_bool = context.Attr("align_corners"); + const std::string data_format = + paddle::framework::DataLayoutToString(input->layout()); + + // attr to real param + bool is_nearest_bool; + if (mode == "bilinear") { + is_nearest_bool = false; + } else if (mode == "nearest") { + is_nearest_bool = true; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "should not reach here: mode should be either 'bilinear' or " + "'nearest', bot got %s.", + mode)); + } + + // attention: 0: zeros, 2: reflection, 1: border according to XDNN 
api. + int padding_mode_int; + if (padding_mode == "zeros") { + padding_mode_int = 0; + } else if (padding_mode == "reflection") { + padding_mode_int = 2; + } else if (padding_mode == "border") { + padding_mode_int = 1; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "should not reach here: padding_mode should be either 'zeros' or " + "'reflection' or 'border', bot got %s.", + padding_mode)); + } + + bool is_nchw_bool; + if (data_format == "NCHW") { + is_nchw_bool = true; + } else if (data_format == "NHWC") { + is_nchw_bool = false; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "should not reach here: data_format should be either 'NCHW' or " + "'NHWC', bot got %s.", + data_format)); + } + + // data pointers + const T* input_data = input->data(); + const T* grid_data = grid->data(); + T* output_data = + output->mutable_data({n, c, out_h, out_w}, context.GetPlace()); + + auto& dev_ctx = context.template device_context(); + // int grid_sample(Context* ctx, const T* x, const T* grid, T* y, int n, int + // c, int xh, int xw, int yh, int yw, bool is_nearest, bool align_corners, + // int padding_mode, bool is_nchw); + int r = xpu::grid_sample(dev_ctx.x_context(), + input_data, + grid_data, + output_data, + n, + c, + h, + w, + out_h, + out_w, + is_nearest_bool, + align_corners_bool, + padding_mode_int, + is_nchw_bool); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "grid_sampler"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + grid_sampler, + ops::GridSamplerXPUKernel); + +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index c5a70b03cd3c8..7e9c61289b67f 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -240,6 +240,8 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"grid_sampler", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"hard_swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py new file mode 100644 index 0000000000000..967815cc559ee --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py @@ -0,0 +1,284 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
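# The reference helpers below map normalized grid coordinates in [-1, 1] back
# to pixel indices before sampling. A minimal illustrative sketch of that
# mapping (mirroring the unnormalizeAndClip helper further down; the numbers
# are only an example):
#
#   import numpy as np
#   g = np.array([-1.0, 0.0, 1.0])              # normalized coordinates
#   W = 8                                        # size of the sampled axis
#   align_true = 0.5 * (g + 1.0) * (W - 1)       # align_corners=True  -> [0, 3.5, 7]
#   align_false = 0.5 * (g + 1.0) * W - 0.5      # align_corners=False -> [-0.5, 3.5, 7.5]
#
# With align_corners=False the mapped index can fall half a pixel outside the
# input, which is what the padding_mode branches (zeros/border/reflection)
# handle.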
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle + +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +def AffineGrid(theta, grid_shape): + n = grid_shape[0] + h = grid_shape[1] + w = grid_shape[2] + h_idx = np.repeat(np.linspace(-1, 1, h)[np.newaxis, :], w, + axis=0).T[:, :, np.newaxis] + w_idx = np.repeat(np.linspace(-1, 1, w)[np.newaxis, :], h, + axis=0)[:, :, np.newaxis] + grid = np.concatenate([w_idx, h_idx, np.ones([h, w, 1])], + axis=2) # h * w * 3 + grid = np.repeat(grid[np.newaxis, :], n, axis=0) # n * h * w *3 + + ret = np.zeros([n, h * w, 2]) + theta = theta.transpose([0, 2, 1]) + for i in range(len(theta)): + ret[i] = np.dot(grid[i].reshape([h * w, 3]), theta[i]) + + return ret.reshape([n, h, w, 2]).astype("float64") + + +def getGridPointValue(data, x, y): + data_shape = data.shape + N = data_shape[0] + C = data_shape[1] + in_H = data_shape[2] + in_W = data_shape[3] + out_H = x.shape[1] + out_W = x.shape[2] + + #out = np.zeros(data_shape, dtype='float64') + out = np.zeros([N, C, out_H, out_W], dtype='float64') + for i in range(N): + for j in range(out_H): + for k in range(out_W): + if y[i, j, k] < 0 or y[i, j, k] > in_H - 1 or x[ + i, j, k] < 0 or x[i, j, k] > in_W - 1: + out[i, :, j, k] = 0 + else: + out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]] + + return out + + +def clip(x, min_n, max_n): + return np.maximum(np.minimum(x, max_n), min_n) + + +def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode): + if align_corners: + grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) * max_val) + else: + grid_slice = 0.5 * ((grid_slice.astype('float64') + 1.0) * + (max_val + 1)) - 0.5 + + if padding_mode == "border": + grid_slice = clip(grid_slice, 0, max_val) + elif padding_mode == "reflection": + double_range = 2 * max_val if align_corners else (max_val + 1) * 2 + grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice + + 0.5) + extra = grid_abs - np.floor(grid_abs / double_range) * double_range + grid_slice = np.minimum(extra, double_range - extra) + grid_slice = grid_slice if align_corners else clip( + grid_slice - 0.5, 0, max_val) + return grid_slice + + +def GridSampler(data, + grid, + align_corners=True, + mode="bilinear", + padding_mode="zeros"): + dims = data.shape + N = dims[0] + in_C = dims[1] + in_H = dims[2] + in_W = dims[3] + + out_H = grid.shape[1] + out_W = grid.shape[2] + + x = grid[:, :, :, 0] + y = grid[:, :, :, 1] + y_max = in_H - 1 + x_max = in_W - 1 + + x = unnormalizeAndClip(x, x_max, align_corners, padding_mode) + y = unnormalizeAndClip(y, y_max, align_corners, padding_mode) + + if mode == "bilinear": + x0 = np.floor(x).astype('int32') + x1 = x0 + 1 + y0 = np.floor(y).astype('int32') + y1 = y0 + 1 + + wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)), + (1, in_C, 1, 1)) + + va = getGridPointValue(data, x0, y0) + vb = getGridPointValue(data, x0, y1) + vc = getGridPointValue(data, x1, y0) + vd = getGridPointValue(data, x1, y1) + + out = (wa * va + wb * vb + wc * vc + wd * vd).astype('float64') + elif mode == "nearest": + x = 
np.round(x).astype('int32') + y = np.round(y).astype('int32') + out = getGridPointValue(data, x, y) + return out + + +class XPUTestGridSamplerOP(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'grid_sampler' + self.use_dynamic_create_class = False + + class TestXPUGridSamplerOp(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_dtype() + self.op_type = 'grid_sampler' + + self.use_cudnn = False + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + + self.initTestCase() + + x = np.random.uniform(-10, 10, self.x_shape).astype(self.dtype) + + theta = np.zeros(self.theta_shape).astype(self.dtype) + for i in range(self.theta_shape[0]): + for j in range(2): + for k in range(3): + theta[i, j, k] = np.random.rand(1)[0] + grid = AffineGrid(theta, self.grid_shape).astype(self.dtype) + + self.inputs = {'X': x, 'Grid': grid} + self.attrs = { + 'use_cudnn': self.use_cudnn, + "align_corners": self.align_corners, + "padding_mode": self.padding_mode, + "mode": self.mode, + } + self.outputs = { + 'Output': + GridSampler(x, grid, self.align_corners, self.mode, + self.padding_mode) + } + + def initTestCase(self): + self.x_shape = (2, 3, 8, 8) + self.grid_shape = (2, 7, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X', 'Grid'], 'Output') + + class TestGridSample1(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "zeros" + self.mode = "bilinear" + + class TestGridSample2(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "border" + self.mode = "bilinear" + + class TestGridSample3(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflection" + self.mode = "bilinear" + + class TestGridSample4(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "reflection" + self.mode = "bilinear" + + class TestGridSample5(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflection" + self.mode = "nearest" + + class TestGridSample6(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 128, 128) + self.grid_shape = (2, 130, 130, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflection" + self.mode = "bilinear" + + class TestGridSample7(TestXPUGridSamplerOp): + + def initTestCase(self): + self.x_shape = (2, 3, 128, 128) + self.grid_shape = (2, 130, 130, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + + +support_types = get_xpu_op_support_types('grid_sampler') +for stype in support_types: + create_test_class(globals(), XPUTestGridSamplerOP, 
stype) + +if __name__ == '__main__': + unittest.main() From b809be1acbd8b25581d26697418053371a4669e9 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Wed, 13 Jul 2022 10:02:24 +0800 Subject: [PATCH 157/250] Support zero dims input for eager run program OP (#44273) --- .../eager/to_static/run_program_op_node.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 4d08146f7aafe..2af2bd369b42b 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -58,13 +58,14 @@ static void CheckInputVarStatus(const Tensor &tensor) { "wrong type. Expect type is DenseTensor.", tensor.name())); - PADDLE_ENFORCE_EQ(tensor.initialized(), - true, - paddle::platform::errors::InvalidArgument( - "The tensor in input tensor %s of " - "RunProgram(Grad)Op " - "is not initialized.", - tensor.name())); + PADDLE_ENFORCE_EQ( + static_cast(tensor.impl().get())->IsInitialized(), + true, + paddle::platform::errors::InvalidArgument( + "The tensor in input tensor %s of " + "RunProgram(Grad)Op " + "is not initialized.", + tensor.name())); } static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, @@ -84,7 +85,7 @@ static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, "RunProgram(Grad)Op's internal scope holds " "wrong type. Expect type is DenseTensor", name)); - PADDLE_ENFORCE_EQ(src_tensor.initialized(), + PADDLE_ENFORCE_EQ(src_tensor.IsInitialized(), true, paddle::platform::errors::InvalidArgument( "The tensor in output tensor %s get from " @@ -120,7 +121,7 @@ static void ShareTensorsIntoScope(const std::vector &tensors, paddle::framework::Scope *scope) { for (size_t i = 0; i < tensors.size(); ++i) { auto name = tensors[i].name(); - if (name == "Fake_var" || !tensors[i].initialized()) { + if (name == "Fake_var") { continue; } auto *var = scope->Var(name); From 8dd1820808a36185ee60f2a506f6972a4247255b Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Wed, 13 Jul 2022 10:07:26 +0800 Subject: [PATCH 158/250] [Paddle-TRT] fix convtranspose and elementwise in op_teller (#44147) * fix convtranspose and elementwise --- .../tensorrt/convert/elementwise_op.cc | 11 +- paddle/fluid/inference/tensorrt/op_teller.cc | 54 ++++++- .../test_trt_convert_conv2d_transpose.py | 118 ++++++++++++++ .../inference/test_trt_convert_elementwise.py | 151 ++++++++++++++---- 4 files changed, 293 insertions(+), 41 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 365523508f5df..7fd89dd731a8e 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -62,11 +62,13 @@ class ElementwiseTensorOpConverter : public OpConverter { } else { Y = engine_->GetITensor(op_desc.Input("Y").front()); } - + bool swap_xy = false; + // Swap X and Y if (X->getDimensions().nbDims < Y->getDimensions().nbDims) { auto* tmp = X; X = Y; Y = tmp; + swap_xy = true; } nvinfer1::Dims dims_x = X->getDimensions(); nvinfer1::Dims dims_y = Y->getDimensions(); @@ -130,6 +132,13 @@ class ElementwiseTensorOpConverter : public OpConverter { reshape_y_tensor = Y; } + // We should swap X and Y back, because some operators do not have symmetry + if (swap_xy) { + auto* tmp = 
reshape_y_tensor; + reshape_y_tensor = X; + X = tmp; + } + auto op_pair = ops.find(op_type_); PADDLE_ENFORCE_NE(op_pair, ops.end(), diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 190f6c731a3b4..89019835a65fd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -325,6 +325,28 @@ bool OpTeller::Tell(const framework::ir::Node* node, #endif } + // In static shape mode in TRT, we can't allow that op's input is a + // 1D-tensor So we filter it here. Some op like elementwise having "Y" too, + // but that is dealt with in the specified op, here just the common case + if (!with_dynamic_shape) { + std::string X_name; + auto inputs = desc.Inputs(); + if (inputs.count("X")) { + X_name = desc.Input("X")[0]; + } else if (inputs.count("Input")) { + X_name = desc.Input("Input")[0]; + } + auto* block = desc.Block(); + if (block) { + auto* x_var_desc = block->FindVar(X_name); + // Can't get feed op's TensorDesc + if (op_type != "feed" && x_var_desc && !x_var_desc->Persistable()) { + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() == 1) return false; + } + } + } + if (op_type == "pool2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); @@ -1309,14 +1331,19 @@ bool OpTeller::Tell(const framework::ir::Node* node, auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); const auto x_shape = x_var_desc->GetShape(); const auto y_shape = y_var_desc->GetShape(); - if (x_shape.size() == 1 && y_shape.size() == 1) { - VLOG(3) << "Now trt may not support two 1d tensor elementwise op."; + + // The case when x_shape.size() == 1 is dealt with in common case + if (!with_dynamic_shape && (!y_var_desc->Persistable()) && + y_shape.size() == 1) { + VLOG(3) << "Static shape in trt not support y is a 1D intermediate " + "tensor in " + "elementwise op."; return false; } - if (x_var_desc->Persistable()) { - VLOG(3) << "Input X is a parameter which is not supported for " - "elementwise_add/elementwise_mul in tensorrt, swap x and " - "y will work"; + if (x_var_desc->Persistable() && !with_dynamic_shape) { + VLOG(3) + << "Input X is a parameter which is not supported for " + "elementwise in tensorrt's static shape, swap x and y will work"; return false; } } @@ -1912,6 +1939,21 @@ bool OpTeller::Tell(const framework::ir::Node* node, } #endif + // conv2d_transpose, conv3d_transpose, depthwise_conv2d_transpose + if (op_type.find("d_transpose") > 0) { + // trt doen't support output_padding, + // output_padding is set when stride > 1 + if (desc.HasAttr("output_padding")) { + const std::vector output_padding = + BOOST_GET_CONST(std::vector, desc.GetAttr("output_padding")); + if (output_padding.size() > 0) { + int max_padding = + *std::max_element(output_padding.begin(), output_padding.end()); + if (max_padding > 0) return false; + } + } + } + if (op_type == "conv3d" || op_type == "conv3d_transpose") { if (desc.HasAttr("padding_algorithm")) { std::string padding_algorithm = diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py index 0db051560516d..cab61143b7737 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py @@ -219,5 +219,123 @@ def test_quant(self): self.run_test(quant=True) +# Special case +class 
TrtConvertConv2dTransposeTest2(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, num_channels, attrs: List[Dict[str, Any]]): + return np.ones([batch, num_channels, 20, 30]).astype(np.float32) + + def generate_weight1(num_channels, attrs: List[Dict[str, Any]]): + return np.random.random([num_channels, 64, 3, 3]).astype(np.float32) + + num_channels = 128 + batch = 1 + + self.num_channels = num_channels + dics = [{ + "data_fromat": 'NCHW', + "dilations": [1, 1], + "padding_algorithm": 'EXPLICIT', + "groups": 1, + "paddings": [1, 1], + "strides": [2, 2], + "output_padding": [1, 1], + "output_size": [], + }] + + ops_config = [{ + "op_type": "conv2d_transpose", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"] + }, + "op_outputs": { + "Output": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_weight": + TensorConfig( + data_gen=partial(generate_weight1, num_channels, dics)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial(generate_input1, batch, + num_channels, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 128, 20, 30], + } + self.dynamic_shape.max_input_shape = { + "input_data": [1, 128, 20, 30], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 128, 20, 30], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 0, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-3) + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + def test_quant(self): + self.add_skip_trt_case() + self.run_test(quant=True) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index db011c5bd54f6..c692b3f9d677f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -124,6 +124,101 @@ def test(self): 
self.run_test() +# This is the special test case +class TrtConvertElementwiseTest_one_input_special_case1(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + + def generate_input(shape): + return np.random.random(shape).astype(np.float32) + + def generate_weight(): + return np.random.randn(1).astype(np.float32) + + for shape in [[32]]: + for op_type in ["elementwise_add", "elementwise_mul"]: + for axis in [-1]: + self.dims = len(shape) + dics = [{"axis": axis}] + ops_config = [{ + "op_type": op_type, + "op_inputs": { + "X": ["input_data"], + "Y": ["weight"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "weight": + TensorConfig(data_gen=partial(generate_weight)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input, shape)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [32]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if not dynamic_shape: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: @@ -206,7 +301,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if self.dims == 1: + if self.dims == 1 and not dynamic_shape: return 0, 3 return 1, 2 @@ -244,10 +339,6 @@ class TrtConvertElementwiseTest_two_input_without_broadcast( TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: - inputs = program_config.inputs - if len(inputs['input_data1'].shape) == 1: - return False - return True def sample_program_configs(self): @@ -353,6 +444,11 @@ def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} self.dynamic_shape.opt_input_shape = {} + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1 and not dynamic_shape: + return 0, 4 + return 1, 3 + attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] @@ -360,9 +456,11 @@ 
def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) @@ -519,15 +617,19 @@ def sample_program_configs(self): def generate_input(shape): return np.random.random(shape).astype(np.float32) + # use rand not randn to avoiding pow producing `NAN` def generate_weight(): - return np.random.randn(32).astype(np.float32) + return np.random.rand(32).astype(np.float32) for batch in [1, 2, 4]: for shape in [[32], [batch, 32], [batch, 32, 32], [batch, 32, 16, 32]]: for op_type in [ - "elementwise_add", "elementwise_mul", "elementwise_sub", - "elementwise_div", "elementwise_pow" + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", ]: for axis in [-1 if len(shape) == 1 else 1]: self.dims = len(shape) @@ -595,11 +697,6 @@ def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} self.dynamic_shape.opt_input_shape = {} - def generate_trt_nodes_num(attrs, dynamic_shape): - if self.dims == 1: - return 0, 3 - return 1, 2 - attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] @@ -607,33 +704,19 @@ def generate_trt_nodes_num(attrs, dynamic_shape): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + yield self.create_inference_config(), (0, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + yield self.create_inference_config(), (0, 3), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + yield self.create_inference_config(), (1, 2), 1e-5 def add_skip_trt_case(self): - - def teller1(program_config, predictor_config): - input_x_names = program_config.ops[0].inputs["X"] - for weight_name in program_config.weights: - if weight_name in input_x_names: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_SUPPORT, - "Input X should not be parameters in elementwise op.") + pass def test(self): self.add_skip_trt_case() From aa71c5512502976073afc070fc9e1aa3edf03743 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Wed, 13 Jul 2022 10:26:22 +0800 Subject: [PATCH 159/250] add more interface for user in common_ops_import (#44270) --- python/paddle/common_ops_import.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/common_ops_import.py b/python/paddle/common_ops_import.py index de8056f280a39..d8fdac59df48f 100644 --- a/python/paddle/common_ops_import.py +++ b/python/paddle/common_ops_import.py @@ -14,7 +14,7 @@ from six.moves import reduce from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.param_attr import ParamAttr -from 
paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode, _varbase_creator +from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode, _varbase_creator, in_dygraph_mode, _in_legacy_dygraph from paddle.fluid.framework import device_guard, default_main_program, dygraph_only, _dygraph_tracer from paddle.fluid.framework import OpProtoHolder, Variable from paddle.fluid.initializer import Constant From 469d5ab47a8ff08cba822f5c796bc93ab44b83e9 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 13 Jul 2022 10:26:38 +0800 Subject: [PATCH 160/250] fix bug of data transform on xpu (#44262) --- paddle/fluid/framework/operator.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0fca87df34f5a..83521be98fc17 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2103,7 +2103,9 @@ Scope* OperatorWithKernel::PrepareData( auto tensor_backend = phi::TransToPhiBackend(tensor_in->place()); if ((in_def->backend != tensor_backend && (in_def->backend != phi::Backend::GPUDNN || - tensor_backend != phi::Backend::GPU)) || + tensor_backend != phi::Backend::GPU) && + (in_def->backend != phi::Backend::KPS || + tensor_backend != phi::Backend::XPU)) || tensor_in->place().GetType() == AllocationType::GPUPINNED) { new_expected_kernel_key = std::make_unique( expected_kernel_key.data_type_, From abc2cc5704178dfff968aae81ffaed1e2b67b992 Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Wed, 13 Jul 2022 10:32:15 +0800 Subject: [PATCH 161/250] fix transform data (#44266) * fix transform data * fix dropout kernel * Revert "fix transform data" This reverts commit ada75ecd169ea194ce43f7ed75dcc968f5ed2fb9. 
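A note on the registration change below: the dropout kernels take the random
seed tensor as their second input (input index 1), and that tensor is usually
created on the CPU. Marking it with
kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND) in the registration
hook should, as far as I can tell from the phi registration hooks, let the
kernel accept the seed on any backend instead of triggering a device transform
onto the GPU.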
--- paddle/phi/kernels/gpu/dropout_kernel.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/dropout_kernel.cu b/paddle/phi/kernels/gpu/dropout_kernel.cu index 2fa3c7639e396..f973bb8e15fc7 100644 --- a/paddle/phi/kernels/gpu/dropout_kernel.cu +++ b/paddle/phi/kernels/gpu/dropout_kernel.cu @@ -84,7 +84,9 @@ PD_REGISTER_KERNEL(dropout, float, double, phi::dtype::bfloat16, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(dropout_nd, GPU, @@ -93,4 +95,6 @@ PD_REGISTER_KERNEL(dropout_nd, float, double, phi::dtype::bfloat16, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND); +} From bcf57274e225a7b59aee9918eafb2783c83916bd Mon Sep 17 00:00:00 2001 From: zmxdream Date: Wed, 13 Jul 2022 10:33:13 +0800 Subject: [PATCH 162/250] fix device optimizer config (#44282) --- paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 92df8d8581a86..bb9998249048e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -169,6 +169,7 @@ HashTable::HashTable(size_t capacity) { template HashTable::~HashTable() { delete container_; + cudaFree(device_optimizer_config_); } template From 153e030bea8b2159ad1a715ebf6a850bc28ce41f Mon Sep 17 00:00:00 2001 From: fuyou765 <64373205+fuyou765@users.noreply.github.com> Date: Wed, 13 Jul 2022 10:42:51 +0800 Subject: [PATCH 163/250] [MLU]add mlu kernel for reciprocal and reciprocal grad op (#43855) --- paddle/fluid/operators/activation_op_mlu.cc | 70 +++++++++++++++++++ .../unittests/mlu/test_reciprocal_op_mlu.py | 69 ++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_reciprocal_op_mlu.py diff --git a/paddle/fluid/operators/activation_op_mlu.cc b/paddle/fluid/operators/activation_op_mlu.cc index 6ba86351e6af5..d1087965f044e 100644 --- a/paddle/fluid/operators/activation_op_mlu.cc +++ b/paddle/fluid/operators/activation_op_mlu.cc @@ -399,11 +399,81 @@ class HardSigmoidGradMLUKernel : public framework::OpKernel { } }; +template +class ReciprocalMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + out->mutable_data(place); + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Reciprocal( + ctx, x_desc.get(), GetBasePtr(x), out_desc.get(), GetBasePtr(out)); + } +}; + +template +class ReciprocalGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto place = ctx.GetPlace(); + dx->mutable_data(place); + Tensor square_out; + square_out.Resize(out->dims()); + square_out.mutable_data(place); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnlTensorDesc square_out_desc(square_out); + MLUCnnl::Square(ctx, + out_desc.get(), + GetBasePtr(out), + square_out_desc.get(), + GetBasePtr(&square_out)); + cnnlOpTensorDesc_t 
op_tensor_op = CNNL_OP_TENSOR_MUL; + cnnlDataType_t op_tensor_comp_type = CNNL_DTYPE_FLOAT; + cnnlNanPropagation_t op_tensor_nan_opt = CNNL_NOT_PROPAGATE_NAN; + MLUCnnlOpTensorDesc op_tensor_desc( + op_tensor_op, op_tensor_comp_type, op_tensor_nan_opt); + float alpha1_float = -1; + float alpha2_float = 1; + float beta_float = 0; + MLUCnnl::OpTensor(ctx, + op_tensor_desc.get(), + dout_desc.get(), + GetBasePtr(dout), + square_out_desc.get(), + GetBasePtr(&square_out), + dx_desc.get(), + GetBasePtr(dx), + op_tensor_comp_type, + alpha1_float, + alpha2_float, + beta_float); + } +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; +// reciprocal +REGISTER_OP_MLU_KERNEL( + reciprocal, + ops::ReciprocalMLUKernel, + ops::ReciprocalMLUKernel); + +REGISTER_OP_MLU_KERNEL( + reciprocal_grad, + ops::ReciprocalGradMLUKernel, + ops::ReciprocalGradMLUKernel); // relu REGISTER_OP_MLU_KERNEL( relu, diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reciprocal_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reciprocal_op_mlu.py new file mode 100644 index 0000000000000..1791b1dab28b8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reciprocal_op_mlu.py @@ -0,0 +1,69 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
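# The MLU grad kernel registered just above relies on the identity
# d(1/x)/dx = -1/x**2; with out = 1/x this gives grad_x = -dout * out**2,
# which is what the Square-then-OpTensor(MUL, alpha1=-1) sequence computes.
# A small, purely illustrative numpy check of that identity:
#
#   x = np.array([1.3, 2.0, 0.7])
#   out = 1.0 / x
#   dout = np.array([0.1, -0.2, 0.3])
#   analytic = -dout * out ** 2
#   numeric = dout * (1.0 / (x + 1e-6) - 1.0 / (x - 1e-6)) / 2e-6
#   assert np.allclose(analytic, numeric, atol=1e-4)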
+ +from __future__ import print_function, division + +import numpy as np +import unittest +import sys + +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle + +paddle.enable_static() + + +class TestMLUReciprocal(OpTest): + + def setUp(self): + self.op_type = "reciprocal" + self.set_mlu() + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.reciprocal(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], + 'Out', + max_relative_error=0.01) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +class TestMLUReciprocalFp16(TestMLUReciprocal): + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() From 20621c7bf052d5b8a4a1d5a151728e679d3482a5 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Wed, 13 Jul 2022 11:12:17 +0800 Subject: [PATCH 164/250] add dependency for fwd_func to avoid compiling error of random parallel compiling (#44277) --- .../api/manual/fluid_manual/forwards/CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt index 4912663ef1f54..295b8d9a6408f 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt @@ -3,21 +3,24 @@ cc_library( SRCS fused_gate_attention_fwd_func.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) -add_dependencies(fused_gate_attention_fwd_func eager_codegen) +add_dependencies(fused_gate_attention_fwd_func eager_codegen + copy_dygraph_forward_functions) cc_library( fused_feedforward_fwd_func SRCS fused_feedforward_fwd_func.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) -add_dependencies(fused_feedforward_fwd_func eager_codegen) +add_dependencies(fused_feedforward_fwd_func eager_codegen + copy_dygraph_forward_functions) cc_library( fused_attention_fwd_func SRCS fused_attention_fwd_func.cc DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) -add_dependencies(fused_attention_fwd_func eager_codegen) +add_dependencies(fused_attention_fwd_func eager_codegen + copy_dygraph_forward_functions) set(fluid_manual_functions fused_gate_attention_fwd_func fused_feedforward_fwd_func From 8761982986db2e382382637902a87c82b6213d48 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Wed, 13 Jul 2022 11:12:54 +0800 Subject: [PATCH 165/250] [Eager] concat_double_grad fill zero for empty grads (#44252) --- .../auto_code_generator/final_state_generator/codegen_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index dee3b3d79a2e7..cd5805740bef0 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -32,7 
+32,7 @@ "square_double_grad", "celu_double_grad", "pad_double_grad", "pad3d_double_grad", "squeeze_double_grad", "unsqueeze_double_grad", "instance_norm_double_grad", "conv3d_double_grad", - "depthwise_conv2d_grad_grad" + "depthwise_conv2d_grad_grad", "concat_double_grad" ]) # For API dispatch used at python-level From d6d60cbc8d7ded1a8b348dd30ff29d5830098aff Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Wed, 13 Jul 2022 11:15:46 +0800 Subject: [PATCH 166/250] =?UTF-8?q?fix=20cpu=20lars=5Fmomentum=20bug=20&?= =?UTF-8?q?=20add=20xpu=20grad=5Fadd/log=5Fsoftmax/log=5Fsoftmax=5F?= =?UTF-8?q?=E2=80=A6=20(#44260)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix cpu lars_momentum bug & add xpu grad_add/log_softmax/log_softmax_grad *test=kunlun * minor *test=kunlun --- .../operators/optimizers/lars_momentum_op.h | 6 +- .../fluid/platform/device/xpu/xpu2_op_list.h | 4 + .../phi/kernels/xpu/elementwise_add_kernel.cc | 41 +++++++ .../kernels/xpu/log_softmax_grad_kernel.cc | 68 +++++++++++ paddle/phi/kernels/xpu/log_softmax_kernel.cc | 47 ++++++++ .../unittests/xpu/get_test_cover_info.py | 5 +- .../unittests/xpu/test_log_softmax_op_xpu.py | 107 ++++++++++++++++++ 7 files changed, 275 insertions(+), 3 deletions(-) create mode 100644 paddle/phi/kernels/xpu/elementwise_add_kernel.cc create mode 100644 paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc create mode 100644 paddle/phi/kernels/xpu/log_softmax_kernel.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h index df4d7b9a0438b..459900b14f61d 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -33,6 +33,7 @@ class LarsMomentumOpKernel : public framework::OpKernel { T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); T epsilon = ctx.Attr("epsilon"); + T rescale_grad = ctx.Attr("rescale_grad"); int op_num = param.size(); for (int i = 0; i < op_num; ++i) { @@ -46,6 +47,7 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto p = framework::EigenVector::Flatten(*(param[i])); auto v = framework::EigenVector::Flatten(*(velocity[i])); auto g = framework::EigenVector::Flatten(*(grad[i])); + auto rescale_g = rescale_grad * g; framework::Tensor p_norm_t, g_norm_t; p_norm_t.Resize({1}); @@ -55,14 +57,14 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto ep_norm = framework::EigenScalar::From(p_norm_t); auto eg_norm = framework::EigenScalar::From(g_norm_t); ep_norm = p.square().sum().sqrt(); - eg_norm = g.square().sum().sqrt(); + eg_norm = rescale_g.square().sum().sqrt(); T local_lr = lr[0]; if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) { local_lr = lr[0] * lars_coeff * ep_norm(0) / (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon); } - v_out = v * mu + local_lr * (g + lars_weight_decay * p); + v_out = v * mu + local_lr * (rescale_g + lars_weight_decay * p); p_out = p - v_out; } } diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 7e9c61289b67f..9f07f05ff7fa6 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -232,6 +232,7 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"generate_proposals_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + 
{"grad_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"greater_equal", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), @@ -274,6 +275,9 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log_softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log_softmax_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lookup_table_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lookup_table_v2", diff --git a/paddle/phi/kernels/xpu/elementwise_add_kernel.cc b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc new file mode 100644 index 0000000000000..34d39b0a83da2 --- /dev/null +++ b/paddle/phi/kernels/xpu/elementwise_add_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" + +namespace phi { + +template +void GradAddXPUKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto x_shape = phi::vectorize(x.dims()); + auto y_shape = phi::vectorize(y.dims()); + int r = xpu::broadcast_add(dev_ctx.x_context(), + x.data(), + y.data(), + out->data(), + x_shape, + y_shape); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); +} + +} // namespace phi + +PD_REGISTER_KERNEL(grad_add, XPU, ALL_LAYOUT, phi::GradAddXPUKernel, float) {} diff --git a/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc new file mode 100644 index 0000000000000..c9165f3ef7d7e --- /dev/null +++ b/paddle/phi/kernels/xpu/log_softmax_grad_kernel.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
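+
+// Note on the math behind LogSoftmaxGradKernel below: for out = log_softmax(x)
+// the gradient is dx = dout - exp(out) * sum(dout, axis). The kernel recovers
+// softmax(x) as exp(out), builds dout / softmax(x), and hands both to
+// xpu::softmax_grad. Assuming that primitive implements the usual softmax
+// backward y * (dy - sum(dy * y, axis)), substituting y = softmax(x) and
+// dy = dout / softmax(x) reduces exactly to the expression above.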
+ +#include "paddle/phi/kernels/log_softmax_grad_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void LogSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + const int rank = out.dims().size(); + axis = funcs::CanonicalAxis(axis, rank); + + if (out.numel() != 0) { + auto out_shape = phi::vectorize(out.dims()); + dev_ctx.template Alloc(x_grad); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + T* tmp_ptr = RAII_GUARD.alloc_l3_or_gm(out_grad.numel()); + T* tmp2_ptr = RAII_GUARD.alloc_l3_or_gm(out_grad.numel()); + PADDLE_ENFORCE_NE( + tmp_ptr, nullptr, phi::errors::External("no enough memory in xpu")); + PADDLE_ENFORCE_NE( + tmp2_ptr, nullptr, phi::errors::External("no enough memory in xpu")); + + int r = + xpu::exp(dev_ctx.x_context(), out.data(), tmp_ptr, out_grad.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp"); + r = xpu::reciprocal( + dev_ctx.x_context(), tmp_ptr, tmp2_ptr, out_grad.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal"); + r = xpu::mul(dev_ctx.x_context(), + tmp2_ptr, + out_grad.data(), + tmp2_ptr, + out_grad.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul"); + r = xpu::softmax_grad(dev_ctx.x_context(), + tmp_ptr, + tmp2_ptr, + x_grad->data(), + out_shape, + axis); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad"); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + log_softmax_grad, XPU, ALL_LAYOUT, phi::LogSoftmaxGradKernel, float) {} diff --git a/paddle/phi/kernels/xpu/log_softmax_kernel.cc b/paddle/phi/kernels/xpu/log_softmax_kernel.cc new file mode 100644 index 0000000000000..1f084d0e6cbf7 --- /dev/null +++ b/paddle/phi/kernels/xpu/log_softmax_kernel.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
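+
+// Note on LogSoftmaxKernel below: mathematically
+// log_softmax(x) = x - max(x) - log(sum(exp(x - max(x)))), but instead of
+// evaluating that form directly the kernel composes two XPU primitives,
+// xpu::softmax followed by an in-place xpu::log on the output; the result is
+// the same provided the softmax primitive is itself numerically stable.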
+ +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + axis = funcs::CanonicalAxis(axis, rank); + + if (x.numel() != 0) { + auto x_shape = phi::vectorize(x.dims()); + dev_ctx.template Alloc(out); + if (axis < 0) axis += rank; + int r = xpu::softmax( + dev_ctx.x_context(), x.data(), out->data(), x_shape, axis); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax"); + r = xpu::log( + dev_ctx.x_context(), out->data(), out->data(), out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "log"); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(log_softmax, XPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float) { +} diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index b4032f2dcb67e..3da9e32b015ed 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -85,7 +85,10 @@ xpu_test_op_white_list = [] xpu_test_device_type_white_list = ['xpu1_float64'] xpu_test_op_type_white_list = [ - 'dropout_float16', 'dropout_grad_float16', 'matmul_v2_float16' + 'dropout_float16', + 'dropout_grad_float16', + 'matmul_v2_float16', + "grad_add_float32" # no api for grad_add, skip ] xpu_test_device_op_white_list = [] xpu_test_device_op_type_white_list = [] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py new file mode 100644 index 0000000000000..e7e730d9b2e25 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py @@ -0,0 +1,107 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys + +sys.path.append("..") +from op_test import OpTest + +import paddle +import paddle.fluid.core as core +import paddle.nn.functional as F + +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() +np.random.seed(10) + + +def ref_log_softmax(x): + shiftx = (x - np.max(x)) + out = shiftx - np.log(np.exp(shiftx).sum()) + return out + + +def ref_log_softmax_grad(x, axis): + if axis < 0: + axis += len(x.shape) + out = np.apply_along_axis(ref_log_softmax, axis, x) + axis_dim = x.shape[axis] + dout = np.full_like(x, fill_value=1. 
/ x.size) + dx = dout - np.exp(out) * dout.copy().sum(axis=axis, keepdims=True).repeat( + axis_dim, axis=axis) + return dx + + +class XPUTestLogSoftmaxOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'log_softmax' + self.use_dynamic_create_class = True + + def dynamic_create_class(self): + base_class = self.TestXPULogSoftmaxOp + classes = [] + axis_arr = [-1, 1] + shape_arr = [[2, 3, 4, 5], [12, 10], [2, 5], [7, 7], [3, 5, 7]] + for axis in axis_arr: + for shape in shape_arr: + class_name = 'XPUTestLogSoftmax_' + \ + str(axis) + "_" + str(shape) + attr_dict = {'axis': axis, 'shape': shape} + classes.append([class_name, attr_dict]) + return base_class, classes + + class TestXPULogSoftmaxOp(XPUOpTest): + + def setUp(self): + self.op_type = 'log_softmax' + self.python_api = F.log_softmax + self.dtype = 'float32' + self.set_attrs() + self.use_xpu = True + if not hasattr(self, 'axis'): + self.shape = [2, 3, 4, 5] + self.axis = -1 + + x = np.random.uniform(0.1, 1., self.shape).astype(self.dtype) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + self.x_grad = ref_log_softmax_grad(x, self.axis) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis} + + def set_attrs(self): + pass + + def test_check_output(self): + self.check_output(check_eager=True) + + def test_check_grad(self): + self.check_grad(['X'], ['Out'], + user_defined_grads=[self.x_grad], + check_eager=True) + + +support_types = get_xpu_op_support_types('log_softmax') +for stype in support_types: + create_test_class(globals(), XPUTestLogSoftmaxOp, stype) + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() From 07c729aa6e1183afc7e5ff113f07b6a6ae98b79c Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Wed, 13 Jul 2022 11:16:04 +0800 Subject: [PATCH 167/250] [Eager] Fix sharding in eager (#44271) * fix sharding in eager * support eager sharding --- .../fleet/meta_parallel/sharding/group_sharded_stage2.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py index 39e92f8878028..f13739960b38a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py @@ -210,9 +210,10 @@ def _grad_scale(self): scale=self._world_size_scaling) # Scale grads of params - for param in self._trainable_params: - if param.name in self._param_grads and param.grad is not None: - param.grad.scale_(scale=self._world_size_scaling) + with paddle.no_grad(): + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.grad.scale_(scale=self._world_size_scaling) # param._reset_grad_inplace_version(True) # Scale grads of master params with offload strategy From 961d6cce06c62d1a342ebd6d7646c52c45357614 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Wed, 13 Jul 2022 11:17:56 +0800 Subject: [PATCH 168/250] [Auto Parallel]Generate default cluster (#44150) * generate default cluster * add unittest --- .../distributed/auto_parallel/cluster.py | 217 +++++++++++++++++- .../unittests/auto_parallel/test_cluster.py | 16 ++ 2 files changed, 228 insertions(+), 5 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/cluster.py b/python/paddle/distributed/auto_parallel/cluster.py index 
e70b29dbe3931..e17f83eb41907 100644 --- a/python/paddle/distributed/auto_parallel/cluster.py +++ b/python/paddle/distributed/auto_parallel/cluster.py @@ -16,6 +16,7 @@ import json from enum import IntEnum from enum import unique +import paddle @unique @@ -138,7 +139,7 @@ def __repr__(self): class Link: default_hop = 1 - default_nic_bandwith = 24 + default_nic_bandwidth = 24 def __init__(self, source, target): self._src = source @@ -411,6 +412,174 @@ def __init__(self): self._alpha_latency = None self._rank_to_device_id = {} self._device_id_to_rank = {} + # This property only be valid when the cluster consists of machines, + # which have the same number accelerators. + self._num_devices_per_machine = None + + def gen_default_config_cluster(self, + gpu_model="V100", + cpu_model="6271C", + node_count=1, + device_count=1, + gpu_memory=32, + cpu_memory=503, + inter_bandwidth=24, + intra_bandwidth=235, + gpu_dp_gflops=7800, + gpu_sp_gflops=15700, + cpu_dp_gflops=75, + cpu_sp_gflops=150): + """Generate cluster by default config.""" + gpu_models = ["V100", "A100", "H100", "A2", "A10", "A16", "A30", "A40"] + xpu_models = ["XPU"] + npu_models = ["NPU"] + dcu_models = ["DCU"] + all_gpu_models = gpu_models + xpu_models + npu_models + dcu_models + assert gpu_model in all_gpu_models + self._num_devices_per_machine = device_count + + def _convert_to_type(gpu_model): + type = None + if gpu_model in gpu_models: + type = "GPU" + elif gpu_model in xpu_models: + type = "XPU" + elif gpu_model in npu_models: + type = "NPU" + elif gpu_model in dcu_models: + type = "DCU" + assert type is not None + + return type + + def _convert_to_model(gpu_model, gpu_memory): + model = None + if gpu_model == "V100": + model = "Tesla V100-SXM2-" + str(gpu_memory) + "GB" + assert model is not None + + return model + + def _convert_to_cpu_info(cpu_model): + arch, vendor, model = None, None, None + if cpu_model == "6271C": + arch = "x86_64" + vendor = "GenuineIntel" + model = "Intel(R) Xeon(R) Gold 6271C CPU @ 2.60G" + elif cpu_model == "6148": + arch = "x86_64" + vendor = "GenuineIntel" + model = "Intel(R) Xeon(R) Gold 6148 CPU @ 2.40G" + assert arch is not None + assert vendor is not None + assert model is not None + + return arch, vendor, model + + cluster_info = {} + cluster_info["machines"] = [] + global_id = 0 + global_id_to_device_type = {} + global_id_to_node = {} + # NOTE: It will support NPU, XPU, DCU models in the future, it is just a fake value now + for i in range(node_count): + machine = {} + # NOTE: The hostname is host_0, host_1, ... 
+ machine["hostname"] = "host_" + str(i) + # NOTE: The addr is localhost, if need actual addr, it should be reset manually + machine["addr"] = "127.0.0.1" + # NOTE: The port is a default value + machine["port"] = 60009 + machine["links"] = [] + + devices = [] + local_id = 0 + + for j in range(device_count): + device = {} + global_id = global_id if i == 0 and j == 0 else global_id + 1 + + local_id += 1 + type = _convert_to_type(gpu_model) + model = _convert_to_model(gpu_model, gpu_memory) + dp_gflops = gpu_dp_gflops + sp_gflops = gpu_dp_gflops + memory = gpu_memory + + device["global_id"] = global_id + device["local_id"] = local_id + device["type"] = type + device["model"] = model + device["memory"] = memory + device["sp_gflops"] = sp_gflops + device["dp_gflops"] = dp_gflops + global_id_to_device_type[global_id] = type + global_id_to_node[global_id] = i + devices.append(device) + + # add cpu device and nic device, just one cpu + cpu_device = {} + arch, vendor, model = _convert_to_cpu_info(cpu_model) + sp_gflops = cpu_sp_gflops + dp_gflops = cpu_dp_gflops + global_id += 1 + local_id = 0 + memory = cpu_memory + type = "CPU" + cpu_device["arch"] = arch + cpu_device["vendor"] = vendor + cpu_device["model"] = model + cpu_device["sp_gflops"] = sp_gflops + cpu_device["dp_gflops"] = dp_gflops + cpu_device["global_id"] = global_id + cpu_device["local_id"] = local_id + cpu_device["memory"] = memory + cpu_device["type"] = type + global_id_to_node[global_id] = i + global_id_to_device_type[global_id] = type + devices.append(cpu_device) + + nic_device = {} + global_id += 1 + + # add NIC + type = "NIC" + width = 12.5 + ip = "127.0.0.1" + local_id = 0 + nic_device["type"] = type + nic_device["local_id"] = type + nic_device["global_id"] = global_id + global_id_to_device_type[global_id] = type + global_id_to_node[global_id] = i + devices.append(nic_device) + machine["devices"] = devices + cluster_info["machines"].append(machine) + + # build link + for i in range(0, global_id + 1): + for j in range(0, global_id + 1): + if i == j: + continue + node_id_i = global_id_to_node[i] + node_id_j = global_id_to_node[j] + device_type_i = global_id_to_device_type[i] + device_type_j = global_id_to_device_type[j] + link = {} + source_global_id = i + target_global_id = j + link["source_global_id"] = source_global_id + link["target_global_id"] = target_global_id + # the same node and device_type, set intra_bandwidth, NVL + if node_id_i == node_id_j and device_type_i == device_type_j: + link["type"] = "NVL" + link["bandwidth"] = intra_bandwidth + else: + link["type"] = "PHB" + link["bandwidth"] = inter_bandwidth + cluster_info["machines"][node_id_i]["links"].append(link) + + self._build_from_dict(cluster_info) @property def rank_to_device_id(self): @@ -473,9 +642,7 @@ def get_device(self, device_global_id): device = machine.devices[device_global_id] return device - def build_from_file(self, json_file_path): - with open(json_file_path) as json_file: - cluster_info = json.load(json_file) + def _build_from_dict(self, cluster_info): machines_info = cluster_info["machines"] for machine_info in machines_info: machine_id = self._generate_machine_id() @@ -533,6 +700,11 @@ def build_from_file(self, json_file_path): else: self._alpha_latecy = None + def build_from_file(self, json_file_path): + with open(json_file_path) as json_file: + cluster_info = json.load(json_file) + self._build_from_dict(cluster_info) + def _generate_machine_id(self): cur_machine_id = self._num_machines self._num_machines += 1 @@ -556,7 +728,7 @@ def 
get_beta(self, source_device_id, target_device_id): bandwidth = None # None means the source and target are not connected directly, set NIC in default if link is None: - bandwidth = Link.default_nic_bandwith + bandwidth = Link.default_nic_bandwidth else: bandwidth = link.bandwidth @@ -608,6 +780,15 @@ def get_involved_machine_count(self, device_ids): assert count > 0 return count + def get_num_machines(self): + return len(self._machines) + + def get_num_devices_per_machine(self): + # Only return the number of accelerators of each machine. + # All machines must has the same number of devices and same type of devices. + assert self._num_devices_per_machine + return self._num_devices_per_machine + def __str__(self): str = "" for machine in self.machines.values(): @@ -616,3 +797,29 @@ def __str__(self): def __repr__(self): return self.__str__() + + +def get_default_cluster(): + cluster = Cluster() + local_device_count = os.getenv("PADDLE_LOCAL_SIZE") + if local_device_count is None: + local_device_count = 1 + else: + local_device_count = int(local_device_count) + global_device_count = os.getenv("PADDLE_GLOBAL_SIZE") + if global_device_count is None: + node_count = 1 + else: + global_device_count = int(global_device_count) + assert global_device_count % local_device_count == 0 + node_count = int(global_device_count) // local_device_count + print("Node Count: ", + node_count, + "Local Device Size: ", + local_device_count, + "World size: ", + paddle.distributed.get_world_size(), + flush=True) + cluster.gen_default_config_cluster(node_count=node_count, + device_count=local_device_count) + return cluster diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py index 641ca38b64944..2fa01bdfa6a59 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_cluster.py @@ -19,6 +19,7 @@ import paddle from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cluster import get_default_cluster cluster_json = """ { @@ -1997,6 +1998,10 @@ def test_single_machine(self): self.assertTrue(devices == [0, 1, 2, 3]) self.assertTrue(involved_machine_count == 1) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + def test_multi_machine(self): # Build cluster cluster_json_path = os.path.join(self.temp_dir.name, @@ -2022,6 +2027,17 @@ def test_multi_machine(self): if os.path.exists(cluster_json_path): os.remove(cluster_json_path) + def test_default_config_cluster(self): + cluster = Cluster() + cluster.gen_default_config_cluster(device_count=8) + # check machines and devices + self.assertTrue(cluster.get_num_machines() == 1) + self.assertTrue(cluster.get_num_devices_per_machine() == 8) + + def test_default_cluster(self): + cluster = get_default_cluster() + self.assertTrue(isinstance(cluster, Cluster)) + if __name__ == "__main__": unittest.main() From 033ef5e94f01530027edbc666717b62814229f56 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Wed, 13 Jul 2022 11:33:20 +0800 Subject: [PATCH 169/250] [CustomKernel] capi add eager mode support (#44164) * [CustomKernel] add capi eager mode support * add ut * add capi test --- paddle/phi/backends/custom/CMakeLists.txt | 4 + paddle/phi/backends/custom/capi_test.cc | 78 +++++ paddle/phi/capi/include/c_kernel_context.h | 20 ++ paddle/phi/capi/include/c_tensor.h | 4 + paddle/phi/capi/include/kernel_registry.h | 124 ++++++- 
paddle/phi/capi/include/kernel_utils.h | 329 ++++++++++++------ paddle/phi/capi/lib/c_kernel_context.cc | 85 +++++ paddle/phi/capi/lib/c_tensor.cc | 15 + .../fluid/tests/custom_runtime/CMakeLists.txt | 3 +- .../custom_runtime/custom_cpu_runtime.cc | 215 ------------ .../tests/custom_runtime/custom_cpu_setup.py | 82 ----- .../custom_runtime/test_custom_cpu_plugin.py | 131 +++++++ .../test_custom_device_data_loader.py | 66 ---- 13 files changed, 675 insertions(+), 481 deletions(-) create mode 100644 paddle/phi/backends/custom/capi_test.cc delete mode 100644 python/paddle/fluid/tests/custom_runtime/custom_cpu_runtime.cc delete mode 100644 python/paddle/fluid/tests/custom_runtime/custom_cpu_setup.py create mode 100644 python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py delete mode 100644 python/paddle/fluid/tests/custom_runtime/test_custom_device_data_loader.py diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt index d8ed6706eba22..ceff429f8e596 100644 --- a/paddle/phi/backends/custom/CMakeLists.txt +++ b/paddle/phi/backends/custom/CMakeLists.txt @@ -11,4 +11,8 @@ if(WITH_CUSTOM_DEVICE) custom_device_test SRCS custom_device_test.cc DEPS device_manager device_context) + cc_test( + capi_test + SRCS capi_test.cc + DEPS phi_capi) endif() diff --git a/paddle/phi/backends/custom/capi_test.cc b/paddle/phi/backends/custom/capi_test.cc new file mode 100644 index 0000000000000..90b01d0e36021 --- /dev/null +++ b/paddle/phi/backends/custom/capi_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
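+
+// A short summary of what the checks below assert: the PD_*Attr adapters are
+// expected to return views over the caller's std::string / std::vector
+// storage (pointer and size equality), while the string and bool list
+// variants materialize fresh buffers whose elements are compared one by one;
+// PD_TensorVectorToList is likewise expected to alias the underlying vector
+// of tensor pointers.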
+ +#include + +#include +#include + +#include "paddle/phi/capi/all.h" + +#ifndef UNUSED +#define UNUSED __attribute__((unused)) +#endif + +#include "paddle/phi/capi/capi.h" + +TEST(CustomKernel, CAPI) { + std::string str = "capi"; + EXPECT_EQ(str.data(), PD_StringAttr(&str)); + + std::vector int32_vec({1, 2, 3}); + auto int32_list = PD_ListInt32Attr(&int32_vec); + EXPECT_EQ(int32_list.data, int32_vec.data()); + EXPECT_EQ(int32_list.size, int32_vec.size()); + + std::vector int64_vec({1, 2, 3}); + auto int64_list = PD_ListInt64Attr(&int64_vec); + EXPECT_EQ(int64_list.data, int64_vec.data()); + EXPECT_EQ(int64_list.size, int64_vec.size()); + + std::vector float_vec({1, 2, 3}); + auto float_list = PD_ListFloatAttr(&float_vec); + EXPECT_EQ(float_list.data, float_vec.data()); + EXPECT_EQ(float_list.size, float_vec.size()); + + std::vector double_vec({1, 2, 3}); + auto double_list = PD_ListDoubleAttr(&double_vec); + EXPECT_EQ(double_list.data, double_vec.data()); + EXPECT_EQ(double_list.size, double_vec.size()); + + std::vector string_vec{"capi", "api"}; + auto string_list = PD_ListStringAttr(&string_vec); + auto string_data = reinterpret_cast(string_list.data); + for (size_t i = 0; i < string_vec.size(); ++i) { + EXPECT_EQ(string_data[i], string_vec[i].data()); + } + + std::vector bool_vec{true, false, true}; + auto bool_list = PD_ListBoolAttr(&bool_vec); + auto bool_data = reinterpret_cast(bool_list.data); + for (size_t i = 0; i < bool_vec.size(); ++i) { + EXPECT_EQ(bool_data[i], static_cast(bool_vec[i])); + } + + std::vector ptr_vec; + for (size_t i = 0; i < float_vec.size(); ++i) { + ptr_vec.push_back(&float_vec[i]); + } + auto ptr_list = PD_TensorVectorToList(reinterpret_cast(&ptr_vec)); + EXPECT_EQ(ptr_list.data, ptr_vec.data()); + EXPECT_EQ(ptr_list.size, ptr_vec.size()); +} + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/paddle/phi/capi/include/c_kernel_context.h b/paddle/phi/capi/include/c_kernel_context.h index c06cb3cd30086..a5524e3aee278 100644 --- a/paddle/phi/capi/include/c_kernel_context.h +++ b/paddle/phi/capi/include/c_kernel_context.h @@ -87,6 +87,26 @@ PD_List PD_KernelContextListScalarAttrAt(PD_KernelContext *ctx, size_t index); PD_Place *PD_KernelContextPlaceAttrAt(PD_KernelContext *ctx, size_t index); +const char *PD_StringAttr(void *attr); + +PD_DataType PD_DatatTypeAttr(void *attr); + +PD_DataLayout PD_DatatLayoutAttr(void *attr); + +PD_List PD_ListInt32Attr(void *attr); + +PD_List PD_ListInt64Attr(void *attr); + +PD_List PD_ListFloatAttr(void *attr); + +PD_List PD_ListDoubleAttr(void *attr); + +PD_List PD_ListScalarAttr(void *attr); + +PD_List PD_ListStringAttr(void *attr); + +PD_List PD_ListBoolAttr(void *attr); + #ifdef __cplusplus } // extern "C" #endif diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h index 494346713cf53..35ac7dda3964d 100644 --- a/paddle/phi/capi/include/c_tensor.h +++ b/paddle/phi/capi/include/c_tensor.h @@ -82,6 +82,10 @@ void PD_TensorShareLoDWith(PD_Tensor *dst, const PD_Tensor *src, PD_Status *status); +PD_Tensor *PD_OptionalTensorGetPointer(PD_Tensor *tensor); + +PD_List PD_TensorVectorToList(PD_Tensor *tensor); + #ifdef __cplusplus } // extern "C" #endif diff --git a/paddle/phi/capi/include/kernel_registry.h b/paddle/phi/capi/include/kernel_registry.h index 37b045a60658b..47ddc0bf5be7e 100644 --- a/paddle/phi/capi/include/kernel_registry.h +++ b/paddle/phi/capi/include/kernel_registry.h @@ -19,7 +19,129 @@ namespace phi { namespace capi 
{ +// eager mode +inline std::vector PD_TensorVector(PD_Tensor *tensor) { + std::vector ret; + auto list = PD_TensorVectorToList(tensor); + auto data = reinterpret_cast(list.data); + for (size_t i = 0; i < list.size; ++i) { + ret.emplace_back(data[i]); + } + return ret; +} + +inline paddle::optional PD_OptionalTensor( + PD_Tensor *tensor) { + auto ptr = PD_OptionalTensorGetPointer(tensor); + return ptr ? paddle::optional( + phi::capi::DenseTensor(ptr)) + : paddle::optional(paddle::none); +} + +template +inline T PD_Attr(void *attr) { + return *reinterpret_cast(attr); +} + +template <> +inline std::string PD_Attr(void *attr) { + return PD_StringAttr(attr); +} + +template <> +inline PD_DataType PD_Attr(void *attr) { + return PD_DatatTypeAttr(attr); +} + +template <> +inline PD_DataLayout PD_Attr(void *attr) { + return PD_DatatLayoutAttr(attr); +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto list = PD_ListInt32Attr(attr); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto list = PD_ListInt64Attr(attr); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto list = PD_ListFloatAttr(attr); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto list = PD_ListDoubleAttr(attr); + auto data = reinterpret_cast(list.data); + std::vector cc_list(data, data + list.size); + return cc_list; +} + +template <> +inline phi::capi::Scalar PD_Attr(void *attr) { + return phi::capi::Scalar(reinterpret_cast(attr)); +} + +template <> +inline phi::capi::IntArray PD_Attr(void *attr) { + return phi::capi::IntArray(reinterpret_cast(attr)); +} + +template <> +inline phi::capi::Place PD_Attr(void *attr) { + return phi::capi::Place(reinterpret_cast(attr)); +} + +template <> +inline std::vector PD_Attr>( + void *attr) { + auto c_list = PD_ListScalarAttr(attr); + auto data = reinterpret_cast(c_list.data); + std::vector list; + for (size_t i = 0; i < c_list.size; ++i) { + list.emplace_back(data[i]); + } + PD_DeletePointerList(c_list); + return list; +} +template <> +inline std::vector PD_Attr>(void *attr) { + auto c_list = PD_ListStringAttr(attr); + auto data = reinterpret_cast(c_list.data); + std::vector list; + for (size_t i = 0; i < c_list.size; ++i) { + list.emplace_back(data[i]); + } + PD_DeletePointerList(c_list); + return list; +} + +template <> +inline std::vector PD_Attr>(void *attr) { + auto c_list = PD_ListBoolAttr(attr); + std::vector list; + auto data = reinterpret_cast(c_list.data); + for (size_t i = 0; i < c_list.size; ++i) { + list[i] = static_cast(data[i]); + } + PD_DeleteUInt8List(c_list); + return list; +} +// inline phi::capi::DeviceContext PD_GetDeviceContext(PD_KernelContext *ctx) { return phi::capi::DeviceContext(PD_KernelContextGetDeviceContext(ctx)); } @@ -189,7 +311,7 @@ inline std::vector PD_AttrAt>( template <> inline std::vector PD_AttrAt>( PD_KernelContext *ctx, size_t index) { - auto c_list = PD_KernelContextListScalarAttrAt(ctx, index); + auto c_list = PD_KernelContextListStringAttrAt(ctx, index); auto data = reinterpret_cast(c_list.data); std::vector list; for (size_t i = 0; i < c_list.size; ++i) { diff --git a/paddle/phi/capi/include/kernel_utils.h b/paddle/phi/capi/include/kernel_utils.h index 
7302e6f4677b3..246bc9e3c5932 100644 --- a/paddle/phi/capi/include/kernel_utils.h +++ b/paddle/phi/capi/include/kernel_utils.h @@ -454,47 +454,67 @@ namespace capi { meta_kernel_fn, \ __VA_ARGS__)) -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx) \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(in_idx == 0, \ + "Kernel's DeviceContext should appear before Inputs."); \ + static_assert( \ + attr_idx == 0, \ + "Kernel's DeviceContext should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's DeviceContext should appear before Outputs."); \ + dev_ctx arg = PD_GetDeviceContext(ctx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + const dev_ctx &arg = std::get(ctx); \ + auto dev_ctx_wrapper = phi::capi::DeviceContext( \ + reinterpret_cast(const_cast(&arg))); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., dev_ctx_wrapper); \ + } \ + } + +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_INPUT(tensor_type) \ template \ - struct CustomKernelCallHelper { \ + struct CustomKernelCallHelper { \ template \ static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(in_idx == 0, \ - "Kernel's DeviceContext should appear before Inputs."); \ - static_assert( \ - attr_idx == 0, \ - "Kernel's DeviceContext should appear before Attributes."); \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ static_assert(out_idx == 0, \ - "Kernel's DeviceContext should appear before Outputs."); \ - dev_ctx arg = PD_GetDeviceContext(ctx); \ + "Kernel's Input should appear before Outputs."); \ + const tensor_type arg = PD_InputAt(ctx, in_idx); \ CustomKernelCallHelper:: \ - template Compute( \ + template Compute( \ ctx, pargs..., arg); \ } \ - } - -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_INPUT(tensor_type) \ - template \ - struct CustomKernelCallHelper { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const tensor_type arg = PD_InputAt(ctx, in_idx); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg); \ - } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + const tensor_type &arg = std::get(ctx); \ + auto tensor = phi::capi::DenseTensor( \ + reinterpret_cast(const_cast(&arg))); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor); \ + } \ } #define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OPTIONAL_INPUT(tensor_type) \ @@ -516,99 +536,168 @@ namespace capi { template Compute( \ ctx, pargs..., arg); \ } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + auto &arg = std::get(ctx); \ + paddle::optional tensor = \ + PD_OptionalTensor(reinterpret_cast( \ + const_cast *>(&arg))); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ - template \ - struct CustomKernelCallHelper &, \ 
- Tail...> { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - auto arg = PD_MultiInputAt(ctx, in_idx); \ - auto arg_wrapper = PD_GetPointerVector(&arg); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg_wrapper); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper &, \ + Tail...> { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + auto arg = PD_MultiInputAt(ctx, in_idx); \ + auto arg_wrapper = PD_GetPointerVector(&arg); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg_wrapper); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + auto &arg = std::get(ctx); \ + auto tensor = PD_TensorVector(reinterpret_cast( \ + const_cast *>(&arg))); \ + auto tensor_ptr_vec = PD_GetPointerVector(&arg); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor_ptr_vec); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(attr_type) \ - template \ - struct CustomKernelCallHelper { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - attr_type arg = PD_AttrAt(ctx, attr_idx); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_ATTRIBUTE(attr_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + attr_type arg = PD_AttrAt(ctx, attr_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + auto &arg = std::get(ctx); \ + auto attr = PD_Attr(reinterpret_cast(&arg)); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., attr); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( \ - attr_type) \ - template \ - struct CustomKernelCallHelper { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - static_assert(out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - attr_type arg = PD_AttrAt(ctx, attr_idx); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_CONST_ATTRIBUTE_REF( \ + attr_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + attr_type arg = PD_AttrAt(ctx, attr_idx); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., arg); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + const attr_type &arg = 
std::get(ctx); \ + auto attr = PD_Attr( \ + reinterpret_cast(const_cast(&arg))); \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., attr); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OUTPUT(tensor_type) \ - template \ - struct CustomKernelCallHelper { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - auto arg = PD_OutputAt(ctx, out_idx); \ - tensor_type *ptr = (arg.raw_data() ? &arg : nullptr); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., ptr); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_OUTPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + auto arg = PD_OutputAt(ctx, out_idx); \ + tensor_type *ptr = (arg.raw_data() ? &arg : nullptr); \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., ptr); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + tensor_type *arg = std::get(ctx); \ + auto tensor = \ + phi::capi::DenseTensor(reinterpret_cast(arg)); \ + auto tensor_ptr = tensor.raw_data() ? &tensor : nullptr; \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor_ptr); \ + } \ } -#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ - template \ - struct CustomKernelCallHelper, Tail...> { \ - template \ - static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ - auto arg = PD_MultiOutputAt(ctx, out_idx); \ - auto arg_wrapper = PD_GetPointerVector(&arg); \ - CustomKernelCallHelper:: \ - template Compute( \ - ctx, pargs..., arg_wrapper); \ - } \ +#define PD_SPECIALIZE_CustomKernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ + template \ + struct CustomKernelCallHelper, Tail...> { \ + template \ + static void Compute(PD_KernelContext *ctx, PreviousArgs &...pargs) { \ + auto arg = PD_MultiOutputAt(ctx, out_idx); \ + std::vector tensor_ptr_vec; \ + for (auto &tensor : arg) { \ + tensor_ptr_vec.push_back(tensor.raw_data() ? &tensor : nullptr); \ + } \ + CustomKernelCallHelper:: \ + template Compute( \ + ctx, pargs..., tensor_ptr_vec); \ + } \ + template \ + static void VariadicCompute(const std::tuple &ctx, \ + PreviousArgs &...pargs) { \ + std::vector &arg = std::get(ctx); \ + auto tensor_vec = PD_TensorVector(reinterpret_cast( \ + const_cast *>(&arg))); \ + std::vector tensor_ptr_vec; \ + for (auto &tensor : tensor_vec) { \ + tensor_ptr_vec.push_back(tensor.raw_data() ? &tensor : nullptr); \ + } \ + return CustomKernelCallHelper::template VariadicCompute( \ + ctx, pargs..., tensor_ptr_vec); \ + } \ } template @@ -627,9 +716,10 @@ struct CustomKernelImpl { template Compute<0, 0, 0, 0>(ctx); } - static void VariadicCompute(const phi::capi::DeviceContext &dev_ctx, - Args... args) { - return kernel_fn(static_cast(dev_ctx), std::forward(args)...); + static void VariadicCompute(DevCtx dev_ctx, Args... args) { + const std::tuple args_tuple(dev_ctx, args...); + return CustomKernelCallHelper>:: + template VariadicCompute<0>(args_tuple); } private: @@ -693,6 +783,13 @@ struct CustomKernelImpl { static_assert(out_idx > 0, "Kernel should have output argument."); return kernel_fn(dev_ctx, args...); } + + template + static void VariadicCompute(const std::tuple &ctx, + DevCtx dev_ctx, + Args... 
args) { + return kernel_fn(dev_ctx, args...); + } }; }; diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc index 2e14b019c19ff..d38a19038e314 100644 --- a/paddle/phi/capi/lib/c_kernel_context.cc +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -220,4 +220,89 @@ PD_DataLayout PD_KernelContextDataLayoutAttrAt(PD_KernelContext* ctx, kernel_context->AttrAt(index)); } +// eager +const char* PD_StringAttr(void* attr) { + auto* str = reinterpret_cast(attr); + return str->c_str(); +} + +PD_DataType PD_DatatTypeAttr(void* attr) { + auto* dtype = reinterpret_cast(attr); + return phi::capi::ToPDDataType(*dtype); +} + +PD_DataLayout PD_DatatLayoutAttr(void* attr) { + auto* layout = reinterpret_cast(attr); + return phi::capi::ToPDDataLayout(*layout); +} + +PD_List PD_ListInt32Attr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_ListInt64Attr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_ListFloatAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_ListDoubleAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + list.data = const_cast(cc_list.data()); + return list; +} + +PD_List PD_ListScalarAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + auto data = new PD_Scalar*[list.size]; + for (size_t i = 0; i < list.size; ++i) { + data[i] = + const_cast(reinterpret_cast(&cc_list[i])); + } + list.data = data; + return list; +} + +PD_List PD_ListStringAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + auto data = new char*[list.size]; + for (size_t i = 0; i < list.size; ++i) { + data[i] = const_cast(cc_list[i].data()); + } + list.data = reinterpret_cast(data); + return list; +} + +PD_List PD_ListBoolAttr(void* attr) { + PD_List list; + const auto& cc_list = *reinterpret_cast*>(attr); + list.size = cc_list.size(); + auto data = reinterpret_cast(new uint8_t[cc_list.size()]); + for (size_t i = 0; i < cc_list.size(); ++i) { + data[i] = static_cast(cc_list[i]); + } + list.data = data; + return list; +} + PD_REGISTER_CAPI(kernel_context); diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc index cd0bbd62d88a0..c81eefe22f77e 100644 --- a/paddle/phi/capi/lib/c_tensor.cc +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -299,4 +299,19 @@ void PD_TensorShareLoDWith(PD_Tensor* dst, meta_dst.share_lod(meta_src); } +PD_Tensor* PD_OptionalTensorGetPointer(PD_Tensor* tensor) { + auto cc_tensor = + reinterpret_cast*>(tensor); + return reinterpret_cast(cc_tensor->get_ptr()); +} + +PD_List PD_TensorVectorToList(PD_Tensor* tensor) { + auto cc_tensor = + reinterpret_cast*>(tensor); + PD_List list; + list.size = cc_tensor->size(); + list.data = cc_tensor->data(); + return list; +} + PD_REGISTER_CAPI(tensor); diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt index acd441c867787..fa2ea2726cfab 100644 --- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt +++ 
b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -1,3 +1,4 @@ if(WITH_CUSTOM_DEVICE) - py_test(test_custom_device_data_loader SRCS test_custom_device_data_loader.py) + py_test(test_custom_cpu_plugin SRCS test_custom_cpu_plugin.py) + set_tests_properties(test_custom_cpu_plugin PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/custom_runtime/custom_cpu_runtime.cc b/python/paddle/fluid/tests/custom_runtime/custom_cpu_runtime.cc deleted file mode 100644 index 18762625c0fe2..0000000000000 --- a/python/paddle/fluid/tests/custom_runtime/custom_cpu_runtime.cc +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "paddle/phi/backends/device_ext.h" - -#define MEMORY_FRACTION 0.5f - -C_Status Init() { return C_SUCCESS; } - -C_Status InitDevice(const C_Device device) { return C_SUCCESS; } - -C_Status SetDevice(const C_Device device) { return C_SUCCESS; } - -C_Status GetDevice(const C_Device device) { - device->id = 0; - return C_SUCCESS; -} - -C_Status DestroyDevice(const C_Device device) { return C_SUCCESS; } - -C_Status Finalize() { return C_SUCCESS; } - -C_Status GetDevicesCount(size_t *count) { - *count = 1; - return C_SUCCESS; -} - -C_Status GetDevicesList(size_t *devices) { - devices[0] = 0; - return C_SUCCESS; -} - -C_Status MemCpy(const C_Device device, - void *dst, - const void *src, - size_t size) { - memcpy(dst, src, size); - return C_SUCCESS; -} - -C_Status AsyncMemCpy(const C_Device device, - C_Stream stream, - void *dst, - const void *src, - size_t size) { - memcpy(dst, src, size); - return C_SUCCESS; -} - -C_Status MemCpyP2P(const C_Device dst_device, - const C_Device src_device, - void *dst, - const void *src, - size_t size) { - memcpy(dst, src, size); - return C_SUCCESS; -} - -C_Status AsyncMemCpyP2P(const C_Device dst_device, - const C_Device src_device, - C_Stream stream, - void *dst, - const void *src, - size_t size) { - memcpy(dst, src, size); - return C_SUCCESS; -} - -C_Status Allocate(const C_Device device, void **ptr, size_t size) { - auto data = malloc(size); - if (data) { - *ptr = data; - return C_SUCCESS; - } else { - *ptr = nullptr; - } - return C_FAILED; -} - -C_Status Deallocate(const C_Device device, void *ptr, size_t size) { - free(ptr); - return C_SUCCESS; -} - -C_Status CreateStream(const C_Device device, C_Stream *stream) { - stream = nullptr; - return C_SUCCESS; -} - -C_Status DestroyStream(const C_Device device, C_Stream stream) { - return C_SUCCESS; -} - -C_Status CreateEvent(const C_Device device, C_Event *event) { - return C_SUCCESS; -} - -C_Status RecordEvent(const C_Device device, C_Stream stream, C_Event event) { - return C_SUCCESS; -} - -C_Status DestroyEvent(const C_Device device, C_Event event) { - return C_SUCCESS; -} - -C_Status SyncDevice(const C_Device device) { return C_SUCCESS; } - -C_Status SyncStream(const C_Device device, C_Stream stream) { - return C_SUCCESS; -} 
- -C_Status SyncEvent(const C_Device device, C_Event event) { return C_SUCCESS; } - -C_Status StreamWaitEvent(const C_Device device, - C_Stream stream, - C_Event event) { - return C_SUCCESS; -} - -C_Status VisibleDevices(size_t *devices) { return C_SUCCESS; } - -C_Status DeviceMemStats(const C_Device device, - size_t *total_memory, - size_t *free_memory) { - float memusage; - FILE *fp; - char buffer[1024]; - size_t byte_read; - char *pos; - - fp = fopen("/proc/meminfo", "r"); - byte_read = fread(buffer, 1, sizeof(buffer), fp); - fclose(fp); - buffer[byte_read] = '\0'; - pos = strstr(buffer, "MemTotal:"); - sscanf(pos, "MemTotal: %lu kB", total_memory); - pos = strstr(pos, "MemFree:"); - sscanf(pos, "MemFree: %lu kB", free_memory); - *total_memory = *total_memory * 1024; - *free_memory = *free_memory * 1024; - *free_memory = *free_memory * MEMORY_FRACTION; - - return C_SUCCESS; -} - -C_Status DeviceMinChunkSize(const C_Device device, size_t *size) { - *size = 512; - return C_SUCCESS; -} - -void InitPlugin(CustomRuntimeParams *params) { - PADDLE_CUSTOM_RUNTIME_CHECK_VERSION(params); - params->device_type = "custom_cpu"; - params->sub_device_type = "v0.1"; - - memset(reinterpret_cast(params->interface), - 0, - sizeof(C_DeviceInterface)); - - params->interface->initialize = Init; - params->interface->finalize = Finalize; - - params->interface->init_device = InitDevice; - params->interface->set_device = SetDevice; - params->interface->get_device = GetDevice; - params->interface->deinit_device = DestroyDevice; - - params->interface->create_stream = CreateStream; - params->interface->destroy_stream = DestroyStream; - - params->interface->create_event = CreateEvent; - params->interface->destroy_event = DestroyEvent; - params->interface->record_event = RecordEvent; - - params->interface->synchronize_device = SyncDevice; - params->interface->synchronize_stream = SyncStream; - params->interface->synchronize_event = SyncEvent; - params->interface->stream_wait_event = StreamWaitEvent; - - params->interface->memory_copy_h2d = MemCpy; - params->interface->memory_copy_d2d = MemCpy; - params->interface->memory_copy_d2h = MemCpy; - params->interface->memory_copy_p2p = MemCpyP2P; - params->interface->async_memory_copy_h2d = AsyncMemCpy; - params->interface->async_memory_copy_d2d = AsyncMemCpy; - params->interface->async_memory_copy_d2h = AsyncMemCpy; - params->interface->async_memory_copy_p2p = AsyncMemCpyP2P; - params->interface->device_memory_allocate = Allocate; - params->interface->host_memory_allocate = Allocate; - params->interface->unified_memory_allocate = Allocate; - params->interface->device_memory_deallocate = Deallocate; - params->interface->host_memory_deallocate = Deallocate; - params->interface->unified_memory_deallocate = Deallocate; - - params->interface->get_device_count = GetDevicesCount; - params->interface->get_device_list = GetDevicesList; - params->interface->device_memory_stats = DeviceMemStats; - params->interface->device_min_chunk_size = DeviceMinChunkSize; -} diff --git a/python/paddle/fluid/tests/custom_runtime/custom_cpu_setup.py b/python/paddle/fluid/tests/custom_runtime/custom_cpu_setup.py deleted file mode 100644 index 82accb2ad00df..0000000000000 --- a/python/paddle/fluid/tests/custom_runtime/custom_cpu_setup.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import site -from paddle.fluid import core -from distutils.sysconfig import get_python_lib -from distutils.core import setup, Extension -from setuptools.command.build_ext import build_ext - - -# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes -# Avoid a gcc warning below: -# cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid -# for C/ObjC but not for C++ -class BuildExt(build_ext): - - def build_extensions(self): - if '-Wstrict-prototypes' in self.compiler.compiler_so: - self.compiler.compiler_so.remove('-Wstrict-prototypes') - super(BuildExt, self).build_extensions() - - -# cc flags -paddle_extra_compile_args = [ - '-std=c++14', - '-shared', - '-fPIC', - '-Wno-parentheses', - '-DPADDLE_WITH_CUSTOM_KERNEL', - '-DPADDLE_WITH_CUSTOM_DEVICE', -] -if core.is_compiled_with_npu(): - paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] - -# include path -site_packages_path = site.getsitepackages() -include_dirs = list( - map(lambda path: os.path.join(path, 'paddle', 'include'), - site_packages_path)) - -# include path third_party -compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], - 'build/third_party') -include_dirs += [ - os.path.join(compile_third_party_path, 'boost/src/extern_boost'), # boost - os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags - os.path.join(compile_third_party_path, 'install/glog/include'), # glog -] - -# libs path -library_dirs = list( - map(lambda path: os.path.join(path, 'paddle', 'fluid'), site_packages_path)) - -# libs -libs = [':core_avx.so'] -if not core.has_avx_core and core.has_noavx_core: - libs = [':core_noavx.so'] - -custom_cpu_plugin_so = Extension('custom_cpu_runtime', - sources=['custom_cpu_runtime.cc'], - include_dirs=include_dirs, - library_dirs=library_dirs, - libraries=libs, - extra_compile_args=paddle_extra_compile_args) - -setup(name='custom_kernel_dot', - version='1.0', - description='custom kernel fot compiling', - cmdclass={'build_ext': BuildExt}, - ext_modules=[custom_cpu_plugin_so]) diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py new file mode 100644 index 0000000000000..7da4f38a83686 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
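+
+# Overview of this test: setUp clones the PaddleCustomDevice repository and
+# builds its custom_cpu backend, then points CUSTOM_DEVICE_ROOT at the build
+# directory so the plugin runtime and kernels are picked up. The test cases
+# run a DataLoader pass and a single MNIST training step on the 'custom_cpu'
+# device, once under the eager guard and once in legacy dygraph mode.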
+ +import os +import sys +import site +import unittest +import numpy as np + + +class TestCustomCPUPlugin(unittest.TestCase): + + def setUp(self): + # compile so and set to current path + cur_dir = os.path.dirname(os.path.abspath(__file__)) + cmd = 'rm -rf PaddleCustomDevice && git clone https://github.com/PaddlePaddle/PaddleCustomDevice.git && cd PaddleCustomDevice/backends/custom_cpu && mkdir build && cd build && cmake .. && make -j8' + os.system(cmd) + + # set environment for loading and registering compiled custom kernels + # only valid in current process + os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( + cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build') + + def test_custom_device_dataloader(self): + import paddle + + with paddle.fluid.framework._test_eager_guard(): + self._test_custom_device_dataloader() + self._test_custom_device_dataloader() + + def _test_custom_device_dataloader(self): + import paddle + + paddle.set_device('custom_cpu') + dataset = paddle.vision.datasets.MNIST( + mode='test', + transform=paddle.vision.transforms.Compose([ + paddle.vision.transforms.CenterCrop(20), + paddle.vision.transforms.RandomResizedCrop(14), + paddle.vision.transforms.Normalize(), + paddle.vision.transforms.ToTensor() + ])) + loader = paddle.io.DataLoader(dataset, + batch_size=32, + num_workers=1, + shuffle=True) + for image, label in loader: + self.assertTrue(image.place.is_custom_place()) + self.assertTrue(label.place.is_custom_place()) + break + + def test_custom_device_mnist(self): + import paddle + + with paddle.fluid.framework._test_eager_guard(): + self._test_custom_device_mnist() + self._test_custom_device_mnist() + + def _test_custom_device_mnist(self): + import paddle + + class MNIST(paddle.nn.Layer): + + def __init__(self): + super(MNIST, self).__init__() + self.shape = 1 * 28 * 28 + self.size = 10 + self.output_weight = self.create_parameter( + [self.shape, self.size]) + self.accuracy = paddle.metric.Accuracy() + + def forward(self, inputs, label=None): + x = paddle.reshape(inputs, shape=[-1, self.shape]) + x = paddle.matmul(x, self.output_weight) + x = paddle.nn.functional.softmax(x) + if label is not None: + self.accuracy.reset() + correct = self.accuracy.compute(x, label) + self.accuracy.update(correct) + acc = self.accuracy.accumulate() + return x, acc + else: + return x + + paddle.set_device('custom_cpu') + dataset = paddle.vision.datasets.MNIST( + mode='train', + transform=paddle.vision.transforms.Compose( + [paddle.vision.transforms.ToTensor()])) + loader = paddle.io.DataLoader(dataset, + batch_size=64, + num_workers=1, + shuffle=True) + + mnist = MNIST() + sgd = paddle.optimizer.SGD(learning_rate=0.01, + parameters=mnist.parameters()) + + data = next(loader()) + img = data[0] + label = data[1] + label_int32 = paddle.cast(label, 'int32') + + pred, acc = mnist(img, label_int32) + avg_loss = paddle.nn.functional.cross_entropy(pred, label_int32) + avg_loss.backward() + sgd.step() + sgd.clear_grad() + + self.assertTrue(pred.place.is_custom_place()) + + def tearDown(self): + del os.environ['CUSTOM_DEVICE_ROOT'] + + +if __name__ == '__main__': + if os.name == 'nt' or sys.platform.startswith('darwin'): + # only support Linux now + exit() + unittest.main() diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_device_data_loader.py b/python/paddle/fluid/tests/custom_runtime/test_custom_device_data_loader.py deleted file mode 100644 index 775c3f487d596..0000000000000 --- a/python/paddle/fluid/tests/custom_runtime/test_custom_device_data_loader.py +++ /dev/null @@ 
-1,66 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import site -import unittest -import numpy as np - - -class TestCustomDeviceDataLoader(unittest.TestCase): - - def setUp(self): - # compile so and set to current path - cur_dir = os.path.dirname(os.path.abspath(__file__)) - - # --inplace to place output so file to current dir - cmd = 'cd {} && {} custom_cpu_setup.py build_ext --inplace'.format( - cur_dir, sys.executable) - os.system(cmd) - - # set environment for loading and registering compiled custom kernels - # only valid in current process - os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir - - def test_custom_device_dataloader(self): - import paddle - - paddle.set_device('custom_cpu') - dataset = paddle.vision.datasets.MNIST( - mode='test', - transform=paddle.vision.transforms.Compose([ - paddle.vision.transforms.CenterCrop(20), - paddle.vision.transforms.RandomResizedCrop(14), - paddle.vision.transforms.Normalize(), - paddle.vision.transforms.ToTensor() - ])) - loader = paddle.io.DataLoader(dataset, - batch_size=32, - num_workers=1, - shuffle=True) - for image, label in loader: - self.assertTrue(image.place.is_custom_place()) - self.assertTrue(label.place.is_custom_place()) - break - - def tearDown(self): - del os.environ['CUSTOM_DEVICE_ROOT'] - - -if __name__ == '__main__': - if os.name == 'nt' or sys.platform.startswith('darwin'): - # only support Linux now - exit() - unittest.main() From 7cf72a3882a9cc60f1fecffe3cf3bad72f6040cf Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 13 Jul 2022 11:54:46 +0800 Subject: [PATCH 170/250] add shape attribute in fill_constant op converted from scale_loss_grad after convert graph to program (#43898) * fix grad loss shape * little change * delete for_test * add unittest for FLAGS_CONVERT_GRAPH_TO_PROGRAM * avoid conflict --- paddle/fluid/framework/ir/graph_helper.cc | 1 + python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt | 3 +++ 2 files changed, 4 insertions(+) diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 315fe3b1e7eee..97f486065ac62 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -452,6 +452,7 @@ static OpDesc *ReplaceScaleLossGradOp(const Node &node, OpDesc *desc) { OpProtoAndCheckerMaker::OpRoleAttrName(), (static_cast(OpRole::kBackward) | static_cast(OpRole::kLoss))); desc->SetAttr("value", 1.0f); + desc->SetAttr("shape", std::vector({1})); std::vector output_names; for (auto out : node.outputs) { output_names.emplace_back(out->Name()); diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt index c60a7511022b4..ee215ebf27a39 100644 --- a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt @@ -55,3 +55,6 @@ py_test_modules( py_test_modules( 
test_standalone_executor_serial_run MODULES test_standalone_executor ENVS FLAGS_new_executor_serial_run=true) + +py_test_modules(test_convert_graph_to_program MODULES test_standalone_executor + ENVS FLAGS_CONVERT_GRAPH_TO_PROGRAM=true) From 2af286a66f2b7db159219f28ea06fdd0ba65fd94 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Wed, 13 Jul 2022 13:15:16 +0800 Subject: [PATCH 171/250] fix bugs of paddle.linalg.lstsq (#44280) --- paddle/fluid/operators/lstsq_op.cu | 23 +++++++++++----- .../tests/unittests/test_linalg_lstsq_op.py | 26 ++++++++++++++++--- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index d0b44d0ec88f4..82a56af7eb4f1 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -100,7 +100,7 @@ class LstsqCUDAKernel : public framework::OpKernel { true, batch_count, m, - n, + nrhs, k, x_data, x_stride, @@ -137,14 +137,17 @@ class LstsqCUDAKernel : public framework::OpKernel { // Step 2, solve R^H Z = Y Tensor trans_r = dito.Transpose(new_x); + Tensor slice_r = dito.Slice(trans_r, {-2}, {0}, {min_mn}); + Tensor res_r = dito.TrilTriu(slice_r, 0, false); + phi::TriangularSolveKernel( - phi_dev_ctx, trans_r, new_y, true, true, false, solution); + phi_dev_ctx, res_r, new_y, true, true, false, solution); // Step 3, X <- Q Z BatchedOrgqr(dev_ctx, batch_count, n, - n, + m, min_mn, x_data, n, @@ -183,8 +186,6 @@ void BatchedOrmqr( auto handle = dev_ctx.cusolver_dn_handle(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSormqr_bufferSize( handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); auto info = memory::Alloc(dev_ctx, sizeof(int)); int* info_d = reinterpret_cast(info->ptr()); @@ -192,6 +193,11 @@ void BatchedOrmqr( float* a_working_ptr = &a[i * a_stride]; float* tau_working_ptr = &tau[i * tau_stride]; float* other_working_ptr = &other[i * other_stride]; + + handle = dev_ctx.cusolver_dn_handle(); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + // compute ormgr PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnSormqr(handle, @@ -249,8 +255,6 @@ void BatchedOrmqr( auto handle = dev_ctx.cusolver_dn_handle(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDormqr_bufferSize( handle, side, trans, m, n, k, a, lda, tau, other, ldc, &lwork)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); auto info = memory::Alloc(dev_ctx, sizeof(int)); int* info_d = reinterpret_cast(info->ptr()); @@ -258,6 +262,11 @@ void BatchedOrmqr( double* a_working_ptr = &a[i * a_stride]; double* tau_working_ptr = &tau[i * tau_stride]; double* other_working_ptr = &other[i * other_stride]; + + handle = dev_ctx.cusolver_dn_handle(); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + // compute ormgr PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cusolverDnDormqr(handle, diff --git a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py index 07729ae4e79cf..60414b8de97a5 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py @@ -175,6 +175,16 @@ def init_config(self): self._input_shape_2 = (5, 8) +class LinalgLstsqTestCase3(LinalgLstsqTestCase): + + def init_config(self): + self.dtype = 'float64' + self.rcond = 1e-15 + self.driver = "gels" + self._input_shape_1 = (10, 7, 3) + self._input_shape_2 = (10, 7, 6) + + class LinalgLstsqTestCaseRcond(LinalgLstsqTestCase): def init_config(self): @@ -192,7 +202,17 @@ def init_config(self): self.rcond = None self.driver = "gels" self._input_shape_1 = (10, 5) - self._input_shape_2 = (10, 2) + self._input_shape_2 = (10, 8) + + +class LinalgLstsqTestCaseGelsFloat64(LinalgLstsqTestCase): + + def init_config(self): + self.dtype = 'float32' + self.rcond = None + self.driver = "gels" + self._input_shape_1 = (3, 2, 8) + self._input_shape_2 = (3, 2, 15) class LinalgLstsqTestCaseGelssFloat64(LinalgLstsqTestCase): @@ -230,9 +250,9 @@ class LinalgLstsqTestCaseBatch2(LinalgLstsqTestCase): def init_config(self): self.dtype = 'float64' self.rcond = 1e-15 - self.driver = "gelss" + self.driver = "gels" self._input_shape_1 = (10, 8, 6) - self._input_shape_2 = (10, 8, 2) + self._input_shape_2 = (10, 8, 10) class LinalgLstsqTestCaseLarge1(LinalgLstsqTestCase): From 01b3ccaed6deb4f05b63c547ab756ab6e2b754d6 Mon Sep 17 00:00:00 2001 From: dongfangshenzhu <102794151+dongfangshenzhu@users.noreply.github.com> Date: Wed, 13 Jul 2022 13:59:43 +0800 Subject: [PATCH 172/250] Zhusonghe (#44274) * add relu6 and relu6_grad * change code style of relu6 and relu6_grad * add relu6 and relu6_grad *test=kunlun * add relu6 and relu6_grad *test=kunlun --- paddle/fluid/operators/activation_op_xpu.cc | 22 +++++++++++++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 2 ++ 2 files changed, 24 insertions(+) diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index 613eea90a6500..0e7136b9f6ce8 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -166,6 +166,24 @@ struct XPUReluGradFunctor : public BaseActivationFunctor { } }; +template +struct XPURelu6Functor : public BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + void operator()(const framework::ExecutionContext &ctx) const { + xpu_activation_forward( + ctx, xpu::relu6); + } +}; + +template +struct XPURelu6GradFunctor : public BaseActivationFunctor { + using XPUType = typename XPUTypeTrait::Type; + void operator()(const framework::ExecutionContext &ctx) const { + xpu_activation_backward( + ctx, xpu::relu6_grad); + } +}; + template struct XPUSigmoidFunctor : public BaseActivationFunctor { using XPUType = typename XPUTypeTrait::Type; @@ -548,6 +566,10 @@ REGISTER_OP_XPU_KERNEL( ops::XPUActivationGradKernel>, ops::XPUActivationGradKernel< ops::XPUReluGradFunctor>); +REGISTER_OP_XPU_KERNEL(relu6, + ops::XPUActivationKernel>); +REGISTER_OP_XPU_KERNEL( + relu6_grad, ops::XPUActivationKernel>); REGISTER_OP_XPU_KERNEL( tanh, ops::XPUActivationKernel>, diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 9f07f05ff7fa6..e7570de695f28 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -349,6 +349,8 @@ XPUOpMap& get_kl2_ops() { {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu6", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + 
{"relu6_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, From d4699bd6efeaf880a4fee40fa52cb1e93eb896d3 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 13 Jul 2022 14:08:13 +0800 Subject: [PATCH 173/250] Enable test_tensordot (#42932) * Enable test_tesnsordot * Fix CI errors * Disable windows inference test * Fix typos * Fix typos * Fix CI errors * Remove disable_wingpu11_test --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 +--- .../paddle/fluid/tests/unittests/test_tensordot.py | 10 +++++----- tools/windows/run_unittests.sh | 12 ++++-------- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 606f39c5e3b42..28bd796efdcee 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -205,8 +205,6 @@ endif() # Temporally disable test_deprecated_decorator list(REMOVE_ITEM TEST_OPS test_deprecated_decorator) -list(REMOVE_ITEM TEST_OPS test_tensordot) - if(WIN32) list(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception) list(REMOVE_ITEM TEST_OPS test_trainer_desc) @@ -1419,7 +1417,7 @@ set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120) -#set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) +set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py index e5d563455e896..bcba40090b858 100644 --- a/python/paddle/fluid/tests/unittests/test_tensordot.py +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -185,35 +185,35 @@ def set_dtype(self): self.dtype = np.float64 -class TestTensordotAPIBroadcastCase1(TestTensordotAPI): +class TestTensordotAPIBroadcastCase1(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [1, 1, 1, 5] self.y_shape = [1, 5, 1, 1] -class TestTensordotAPIBroadcastCase2(TestTensordotAPI): +class TestTensordotAPIBroadcastCase2(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [1, 5, 5, 5] self.y_shape = [1, 1, 1, 5] -class TestTensordotAPIBroadcastCase3(TestTensordotAPI): +class TestTensordotAPIBroadcastCase3(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [5, 5, 5, 1] self.y_shape = [5, 5, 1, 5] -class TestTensordotAPIBroadcastCase4(TestTensordotAPI): +class TestTensordotAPIBroadcastCase4(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [5, 5, 5, 1] self.y_shape = [1, 1, 1, 1] -class TestTensordotAPIBroadcastCase5(TestTensordotAPI): +class TestTensordotAPIBroadcastCase5(TestTensordotAPIFloat64): def set_input_shape(self): self.x_shape = [1, 1, 5, 5] diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 7af1cd81391d4..34c3d4156a818 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -72,12 +72,6 @@ disable_win_trt_test="^test_trt_convert_conv2d$|\ ^test_trt_convert_matmul$|\ ^test_trt_convert_scale$" -# /*=============Fixed Disabled Windows 
CUDA11.x MKL(PR-CI-Windows-Inference) unittests=================*/ -# TODO: fix these unittest that is bound to fail -disable_wingpu11_test="^test_autograd_functional_dynamic$|\ -^disable_wingpu_test$" - - # /*==========Fixed Disabled Windows CUDA11.x inference_api_test(PR-CI-Windows-Inference) unittests=============*/ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_trt_dynamic_shape_ernie$|\ @@ -183,7 +177,9 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_unsqueeze2_eltwise_fuse_pass$|\ ^test_parallel_executor_seresnext_with_fuse_all_reduce_gpu$|\ ^test_parallel_executor_seresnext_with_reduce_gpu$|\ -^test_api_impl$" +^test_api_impl$|\ +^test_tensordot$|\ +^disable_wingpu_test$" # /*==========Fixed Disabled Windows CPU OPENBLAS((PR-CI-Windows-OPENBLAS)) unittests==============================*/ @@ -281,7 +277,7 @@ bash $PADDLE_ROOT/tools/check_added_ut_win.sh rm -rf $PADDLE_ROOT/tools/check_added_ut_win.sh if [ -f "$PADDLE_ROOT/added_ut" ];then added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$ - ctest -R "(${added_uts})" -E "$disable_wingpu11_test" --output-on-failure -C Release --repeat-until-fail 3;added_ut_error=$? + ctest -R "(${added_uts})" -E "${disable_win_inference_test}" --output-on-failure -C Release --repeat-until-fail 3;added_ut_error=$? rm -f $PADDLE_ROOT/added_ut if [ "$added_ut_error" != 0 ];then echo "========================================" From 05d5bbfb7d64514e4f101d9c029b2be3c422f4eb Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 13 Jul 2022 14:42:56 +0800 Subject: [PATCH 174/250] [JIT]Layer supports eager dygraph mode and Polish Function interface (#44283) * [JIT]Layer support eager dygraph mode and polish Function interface * remove usless code * fix #define --- paddle/fluid/jit/base_function.h | 20 ++++---- paddle/fluid/jit/compilation_unit.cc | 2 +- paddle/fluid/jit/executor_function.h | 14 ++++-- paddle/fluid/jit/function_utils.cc | 50 ++++++++++++------- paddle/fluid/jit/function_utils.h | 18 ++++--- paddle/fluid/jit/layer.cc | 11 ++-- paddle/fluid/jit/layer.h | 8 ++- paddle/fluid/jit/layer_test.cc | 27 ++++------ paddle/fluid/jit/pe_function.h | 24 +++++---- paddle/fluid/pybind/eager_functions.cc | 17 +++++++ paddle/fluid/pybind/eager_utils.cc | 16 ++++++ paddle/fluid/pybind/eager_utils.h | 3 ++ paddle/fluid/pybind/jit.cc | 40 ++++----------- .../fluid/tests/unittests/test_jit_layer.py | 1 - python/paddle/jit/layer.py | 3 +- 15 files changed, 146 insertions(+), 108 deletions(-) diff --git a/paddle/fluid/jit/base_function.h b/paddle/fluid/jit/base_function.h index ebe4314a5319e..df774d8fd84c7 100644 --- a/paddle/fluid/jit/base_function.h +++ b/paddle/fluid/jit/base_function.h @@ -14,23 +14,23 @@ #pragma once -#include -#include - -#include "paddle/phi/common/place.h" - -#include "paddle/fluid/framework/variable.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/core/dense_tensor.h" namespace paddle { namespace jit { -using Variable = paddle::framework::Variable; +using Tensor = paddle::experimental::Tensor; +using DenseTensor = phi::DenseTensor; + class BaseFunction { public: - virtual std::vector operator()( - const std::vector &inputs) = 0; + virtual std::vector operator()( + const std::vector &inputs) = 0; + + virtual std::vector operator()(const std::vector &inputs) = 0; + virtual ~BaseFunction() {} - // virtual void SetPalce(const phi::Place &place); }; } // namespace jit diff --git a/paddle/fluid/jit/compilation_unit.cc b/paddle/fluid/jit/compilation_unit.cc 
index d62c497d8b338..60d42d045b0e3 100644 --- a/paddle/fluid/jit/compilation_unit.cc +++ b/paddle/fluid/jit/compilation_unit.cc @@ -24,7 +24,7 @@ std::shared_ptr CompilationUnit::Function( PADDLE_ENFORCE_EQ( function_map_.count(name), 1, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Funciton name %s is not exist in function_map_.", name)); return function_map_.at(name); } diff --git a/paddle/fluid/jit/executor_function.h b/paddle/fluid/jit/executor_function.h index 36cb438e34cc2..a9b9d59d21bf4 100644 --- a/paddle/fluid/jit/executor_function.h +++ b/paddle/fluid/jit/executor_function.h @@ -42,17 +42,21 @@ class ExecutorFunction : public BaseFunction { ~ExecutorFunction() noexcept {} - std::vector operator()(const std::vector &inputs) { - utils::ShareInputsIntoScope(info_->InputArgNames(), inputs, &scope_); + std::vector operator()(const std::vector &inputs) { + auto dense_tensors = utils::ToDenseTensors(inputs); + return utils::ToTensors(this->operator()(dense_tensors)); + } + + std::vector operator()(const std::vector &inputs) { + utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); inner_exe_.Run(info_->ProgramDesc(), &scope_, /*blockID=*/0, false, true, info_->OutputArgNames()); - VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); - std::vector res; - utils::FetchVarsByNames(info_->OutputArgNames(), scope_, &res); + std::vector res; + utils::FetchOuts(info_->OutputArgNames(), scope_, &res); return res; } diff --git a/paddle/fluid/jit/function_utils.cc b/paddle/fluid/jit/function_utils.cc index 4757e784dfe75..a6da061de99dc 100644 --- a/paddle/fluid/jit/function_utils.cc +++ b/paddle/fluid/jit/function_utils.cc @@ -21,36 +21,50 @@ namespace paddle { namespace jit { namespace utils { -void FetchVarsByNames(const std::vector &names, - const framework::Scope &scope, - std::vector *outs) { - for (auto &out_name : names) { + +std::vector ToDenseTensors(const std::vector &tensors) { + std::vector ret; + for (auto &t : tensors) { + ret.emplace_back(*std::dynamic_pointer_cast(t.impl())); + } + return ret; +} + +std::vector ToTensors(const std::vector &tensors) { + std::vector ret; + for (auto &t : tensors) { + ret.emplace_back(std::make_shared(t)); + } + return ret; +} + +void FetchOuts(const std::vector &names, + const framework::Scope &scope, + std::vector *outs) { + outs->reserve(names.size()); + for (size_t i = 0; i < names.size(); ++i) { + auto &out_name = names[i]; VLOG(3) << "fetch out: " << out_name; auto *var = scope.FindVar(out_name); auto &src_tensor = var->Get(); - Variable v; - auto *p = v.GetMutable(); - *p = src_tensor; - outs->emplace_back(v); + outs->emplace_back(src_tensor); } } -void ShareInputsIntoScope(const std::vector &ordered_input_names, - const std::vector &vars, - framework::Scope *scope) { - VLOG(3) << "vars size: " << vars.size(); +void ShareIntoScope(const std::vector &ordered_input_names, + const std::vector &tensors, + framework::Scope *scope) { + VLOG(3) << "tensors size: " << tensors.size(); PADDLE_ENFORCE_EQ( - vars.size(), + tensors.size(), ordered_input_names.size(), platform::errors::InvalidArgument( - "vars.size() should be equal to ordered_input_names.size().")); - - for (size_t i = 0; i < vars.size(); i++) { + "tensors.size() should be equal to ordered_input_names.size().")); + for (size_t i = 0; i < tensors.size(); ++i) { VLOG(3) << "share into scope: " << ordered_input_names[i]; - auto &dense_tensor = vars[i].Get(); auto *var = scope->Var(ordered_input_names[i]); auto *dst_tensor = var->GetMutable(); - *dst_tensor = 
dense_tensor; + *dst_tensor = tensors[i]; } } diff --git a/paddle/fluid/jit/function_utils.h b/paddle/fluid/jit/function_utils.h index 49db3f71fbdbf..ba1eaf7308be9 100644 --- a/paddle/fluid/jit/function_utils.h +++ b/paddle/fluid/jit/function_utils.h @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" @@ -30,15 +31,20 @@ namespace jit { using Variable = paddle::framework::Variable; using Name2VariableMap = std::unordered_map; using DenseTensor = phi::DenseTensor; +using Tensor = paddle::experimental::Tensor; + namespace utils { -void FetchVarsByNames(const std::vector &names, - const framework::Scope &scope, - std::vector *outs); +std::vector ToDenseTensors(const std::vector &tensors); +std::vector ToTensors(const std::vector &tensors); -void ShareInputsIntoScope(const std::vector &ordered_input_names, - const std::vector &vars, - framework::Scope *scope); +void FetchOuts(const std::vector &names, + const framework::Scope &scope, + std::vector *outs); + +void ShareIntoScope(const std::vector &ordered_input_names, + const std::vector &vars, + framework::Scope *scope); void ShareParamsIntoScope(const std::vector ¶m_names, const Name2VariableMap ¶ms_dict, diff --git a/paddle/fluid/jit/layer.cc b/paddle/fluid/jit/layer.cc index 6662abd17d2cf..f5985d71b0347 100644 --- a/paddle/fluid/jit/layer.cc +++ b/paddle/fluid/jit/layer.cc @@ -16,9 +16,6 @@ namespace paddle { namespace jit { -// TODO(dev): Make vector, num_slot as in argument -// Layer(const std::shared_ptr& type) : obj_(type, /*num_slot*/ 0U) -// {} Layer::Layer(const std::vector>& infos, const Name2VariableMap& params_dict, const phi::Place& place) @@ -30,7 +27,13 @@ std::shared_ptr Layer::Function(const std::string& name) const { return unit_.Function(name); } -std::vector Layer::forward(const std::vector& inputs) { +std::vector Layer::forward(const std::vector& inputs) { + auto func = Function("forward"); + return (*func)(inputs); +} + +std::vector Layer::forward( + const std::vector& inputs) { auto func = Function("forward"); return (*func)(inputs); } diff --git a/paddle/fluid/jit/layer.h b/paddle/fluid/jit/layer.h index 5c9f61b0d47b3..ee75881fc3156 100644 --- a/paddle/fluid/jit/layer.h +++ b/paddle/fluid/jit/layer.h @@ -32,9 +32,6 @@ using Name2VariableMap = std::unordered_map; class Layer { public: - // TODO(dev): Make vector, num_slot as in argument - // Layer(const std::shared_ptr& type) : obj_(type, /*num_slot*/ 0U) - // {} Layer(const std::vector>& infos, const Name2VariableMap& params_dict, const phi::Place& place); @@ -43,7 +40,9 @@ class Layer { Variable Attribute(const std::string& name) const; - std::vector forward(const std::vector& inputs); + std::vector forward(const std::vector& inputs); + + std::vector forward(const std::vector& inputs); void to(const phi::Place& place); @@ -55,7 +54,6 @@ class Layer { const Name2FunctionMap& FunctionMap() const; private: - // internal::Object obj_; Name2VariableMap params_dict_; Name2VariableMap attrs_dict_; CompilationUnit unit_; diff --git a/paddle/fluid/jit/layer_test.cc b/paddle/fluid/jit/layer_test.cc index 6c9adff385aba..793afacb79dc7 100644 --- a/paddle/fluid/jit/layer_test.cc +++ b/paddle/fluid/jit/layer_test.cc @@ -52,17 +52,16 @@ namespace paddle { namespace jit { using DenseTensor = phi::DenseTensor; -std::vector PrepareInputs(const phi::Place& place) { +std::vector PrepareInputs(const phi::Place& 
place) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(place); - Variable v; - auto* dense_tensor = v.GetMutable(); - dense_tensor->Resize(phi::make_ddim({2, 4})); - dense_tensor->mutable_data(place); - phi::funcs::set_constant(dev_ctx, dense_tensor, 2.); + DenseTensor t; + t.Resize(phi::make_ddim({2, 4})); + t.mutable_data(place); + phi::funcs::set_constant(dev_ctx, &t, 2.); - return {v}; + return {t}; } TEST(CpuLayerTest, Construct) { @@ -72,16 +71,12 @@ TEST(CpuLayerTest, Construct) { auto inputs = PrepareInputs(place); auto outs = layer.forward(inputs); - auto out_vars = outs[0]; - auto out_dense_tensor = out_vars.Get(); - auto out_data = out_dense_tensor.data(); + auto out_data = outs[0].data(); EXPECT_NEAR(out_data[0], 0.02194316, 1e-6); auto func = layer.Function("infer"); outs = (*func)(inputs); - out_vars = outs[0]; - out_dense_tensor = out_vars.Get(); - out_data = out_dense_tensor.data(); + out_data = outs[0].data(); EXPECT_NEAR(out_data[0], 1.41562390, 1e-6); } @@ -98,8 +93,7 @@ TEST(GpuLayerTest, Construct) { auto inputs = PrepareInputs(place); auto outs = layer.forward(inputs); - auto out_vars = outs[0]; - auto out_dense_tensor = out_vars.Get(); + auto out_dense_tensor = outs[0]; phi::Copy( *dev_ctx_gpu, out_dense_tensor, phi::CPUPlace(), true, &cpu_dense_tensor); auto out_data = cpu_dense_tensor.data(); @@ -107,8 +101,7 @@ TEST(GpuLayerTest, Construct) { auto func = layer.Function("infer"); outs = (*func)(inputs); - out_vars = outs[0]; - out_dense_tensor = out_vars.Get(); + out_dense_tensor = outs[0]; phi::Copy( *dev_ctx_gpu, out_dense_tensor, phi::CPUPlace(), true, &cpu_dense_tensor); out_data = cpu_dense_tensor.data(); diff --git a/paddle/fluid/jit/pe_function.h b/paddle/fluid/jit/pe_function.h index a77fd59358660..f174a0e996467 100644 --- a/paddle/fluid/jit/pe_function.h +++ b/paddle/fluid/jit/pe_function.h @@ -43,24 +43,29 @@ class PEFunction : public BaseFunction { ~PEFunction() noexcept {} - std::vector operator()(const std::vector &inputs) { - // bool is_test = true; + std::vector operator()(const std::vector &inputs) { + auto dense_tensors = utils::ToDenseTensors(inputs); + return utils::ToTensors(this->operator()(dense_tensors)); + } + + std::vector operator()(const std::vector &inputs) { std::string prog_string; std::hash string_hash; auto &program_desc = info_->ProgramDesc(); + // TODO(dev): Serialize is very slow. 
const_cast(&program_desc) ->Proto() ->SerializePartialToString(&prog_string); - // program_desc.Proto()->SerializePartialToString(&prog_string); + int64_t program_id = static_cast(string_hash(prog_string)); const framework::BlockDesc &global_block = program_desc.Block(0); int64_t start_op_index = 0; int64_t end_op_index = static_cast(global_block.OpSize()); - utils::ShareInputsIntoScope(info_->InputArgNames(), inputs, &scope_); + utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); std::vector input_var_names = info_->InputArgNames(); std::vector output_var_names = info_->OutputArgNames(); - std::vector dout_var_names; + if (end_op_index > start_op_index) { auto cache_info = framework::GetExecutorInfoFromCache(program_desc, place_, @@ -78,9 +83,7 @@ class PEFunction : public BaseFunction { skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), output_var_names.begin(), output_var_names.end()); - skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), - dout_var_names.begin(), - dout_var_names.end()); + framework::details::ParseSafeEagerDeletionSkipVars( program_desc, end_op_index, @@ -89,9 +92,8 @@ class PEFunction : public BaseFunction { } parallel_executor->RunWithoutFetch(skip_eager_delete_vars); } - VLOG(6) << framework::GenScopeTreeDebugInfo(&scope_); - std::vector res; - utils::FetchVarsByNames(info_->OutputArgNames(), scope_, &res); + std::vector res; + utils::FetchOuts(info_->OutputArgNames(), scope_, &res); return res; } diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index f256787805a0f..3fe2cb170d796 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -357,6 +357,19 @@ static std::vector CastAttrsToTragetType( return res; } +static PyObject* eager_api_jit_function_call(PyObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + std::shared_ptr function = + CastPyArg2BaseFunction(PyTuple_GET_ITEM(args, 0), 0); + std::vector ins = + CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); + std::vector outs = (*function)(ins); + return ToPyObject(outs); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, PyObject* kwargs) { @@ -911,6 +924,10 @@ PyMethodDef variable_functions[] = { (PyCFunction)(void (*)(void))eager_api_read_next_tensor_list, METH_VARARGS | METH_KEYWORDS, NULL}, + {"jit_function_call", + (PyCFunction)(void (*)(void))eager_api_jit_function_call, + METH_VARARGS | METH_KEYWORDS, + NULL}, /**sparse functions**/ {"sparse_coo_tensor", (PyCFunction)(void (*)(void))eager_api_sparse_coo_tensor, diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9e8065a6a438a..185b81677125d 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -51,6 +51,7 @@ extern PyTypeObject* g_customplace_pytype; extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* g_framework_lodtensorarray_pytype; extern PyTypeObject* g_custom_op_kernel_ctx_pytype; +extern PyTypeObject* g_executor_function_pytype; int TensorDtype2NumpyDtype(phi::DataType dtype) { switch (dtype) { @@ -227,6 +228,21 @@ std::shared_ptr CastPyArg2VarBase(PyObject* obj, return py::cast>(obj); } +std::shared_ptr CastPyArg2BaseFunction(PyObject* obj, + ssize_t arg_pos) { + if (PyObject_IsInstance( + obj, reinterpret_cast(g_executor_function_pytype))) { + return ::pybind11::handle(obj) + .cast>(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument 
(position %d) must be " + "BaseFunction, but got %s", + arg_pos + 1, + reinterpret_cast(obj->ob_type)->tp_name)); + } +} + std::vector CastPyArg2VectorOfTensor( PyObject* obj, ssize_t arg_pos) { std::vector result; diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 25dcd91bed0d1..b97dcb9cddbec 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -19,6 +19,7 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/jit/executor_function.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -72,6 +73,8 @@ framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, std::unordered_map CastPyArg2Vocab(PyObject* obj, ssize_t arg_pos); std::vector CastPyArg2Strings(PyObject* obj, ssize_t arg_pos); +std::shared_ptr CastPyArg2BaseFunction(PyObject* obj, + ssize_t arg_pos); PyObject* ToPyObject(int value); PyObject* ToPyObject(uint32_t value); diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc index 07b79742f002e..be2ad50400c77 100644 --- a/paddle/fluid/pybind/jit.cc +++ b/paddle/fluid/pybind/jit.cc @@ -28,39 +28,21 @@ namespace py = pybind11; namespace paddle { namespace pybind { +PyTypeObject *g_executor_function_pytype = nullptr; using Variable = paddle::framework::Variable; void BindJit(pybind11::module *m) { py::class_(*m, "Layer", R"DOC(Layer Class.)DOC") - .def("function_dict", &jit::Layer::FunctionMap); - - py::class_>( - *m, "ExectorFunction", R"DOC(ExectorFunction Class.)DOC") - .def("__call__", - [](jit::ExecutorFunction &self, - const std::vector> - &tensor_inputs) { - std::vector var_inputs; - for (auto &tensor : tensor_inputs) { - var_inputs.emplace_back(tensor->Var()); - } - auto var_outputs = self(var_inputs); - - std::vector> tensor_outputs; - auto output_names = self.Info()->OutputArgNames(); - for (size_t i = 0; i < var_outputs.size(); ++i) { - auto var = var_outputs[i]; - std::string name = output_names[i]; - imperative::VariableWrapper var_wrapper(name, var); - auto shared_wrapper = - std::make_shared(var_wrapper); - auto shared_varbase = - std::make_shared(shared_wrapper); - tensor_outputs.emplace_back(shared_varbase); - } - return tensor_outputs; - }) - .def("info", &jit::ExecutorFunction::Info); + .def("function_dict", + &jit::Layer::FunctionMap, + py::return_value_policy::reference); + + py::class_> + executor_function( + *m, "ExectorFunction", R"DOC(ExectorFunction Class.)DOC"); + g_executor_function_pytype = + reinterpret_cast(executor_function.ptr()); + executor_function.def("info", &jit::ExecutorFunction::Info); py::class_>( *m, "FunctionInfo", R"DOC(FunctionInfo Class.)DOC") diff --git a/python/paddle/fluid/tests/unittests/test_jit_layer.py b/python/paddle/fluid/tests/unittests/test_jit_layer.py index 24c0131fd7012..fd77aa599889f 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_layer.py +++ b/python/paddle/fluid/tests/unittests/test_jit_layer.py @@ -22,7 +22,6 @@ from paddle.jit.layer import Layer from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator -_enable_legacy_dygraph() paddle.seed(1) diff --git a/python/paddle/jit/layer.py b/python/paddle/jit/layer.py index 8ee3652dca843..4aee7a8f5c02a 100644 --- a/python/paddle/jit/layer.py +++ b/python/paddle/jit/layer.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the 
License. +from paddle.fluid import core from paddle.fluid.core import Load @@ -39,7 +40,7 @@ def __init__(self, function): self.info = FunctionInfo(function.info()) def __call__(self, *args): - return self.function(args) + return core.eager.jit_function_call(self.function, args) class FunctionInfo(): From b1aa693e888083226933105a42907f5109753fd7 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 13 Jul 2022 14:59:41 +0800 Subject: [PATCH 175/250] [Phi] Migrate complex_op into Phi & Add complex api yaml (#44233) * mv to phi * refine infermeta code position * refine grad code * add api yaml and add final_state_api * refine code --- paddle/fluid/operators/complex_op.cc | 78 +++-------- paddle/fluid/operators/complex_op.cu | 28 ---- paddle/fluid/operators/complex_op.h | 123 ------------------ paddle/phi/api/yaml/legacy_api.yaml | 9 ++ paddle/phi/api/yaml/legacy_backward.yaml | 10 ++ paddle/phi/infermeta/backward.cc | 17 +++ paddle/phi/infermeta/backward.h | 6 + paddle/phi/infermeta/binary.cc | 32 +++++ paddle/phi/infermeta/binary.h | 4 + paddle/phi/kernels/complex_grad_kernel.h | 8 ++ paddle/phi/kernels/complex_kernel.h | 6 + paddle/phi/kernels/cpu/complex_grad_kernel.cc | 5 + paddle/phi/kernels/cpu/complex_kernel.cc | 5 + paddle/phi/kernels/gpu/complex_grad_kernel.cu | 5 + paddle/phi/kernels/gpu/complex_kernel.cu | 5 + .../kernels/impl/complex_grad_kernel_impl.h | 48 +++++++ paddle/phi/kernels/impl/complex_kernel_impl.h | 43 ++++++ paddle/phi/ops/compat/complex_sig.cc | 7 + .../fluid/tests/unittests/test_complex_op.py | 1 + python/paddle/tensor/creation.py | 3 + 20 files changed, 230 insertions(+), 213 deletions(-) delete mode 100644 paddle/fluid/operators/complex_op.cu delete mode 100644 paddle/fluid/operators/complex_op.h diff --git a/paddle/fluid/operators/complex_op.cc b/paddle/fluid/operators/complex_op.cc index d6d93fe958118..778f5831c0fbb 100644 --- a/paddle/fluid/operators/complex_op.cc +++ b/paddle/fluid/operators/complex_op.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/complex_op.h" - -#include - +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -59,36 +59,6 @@ class ComplexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "complex"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "complex"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "complex"); - - if (ctx->GetInputDim("X") == ctx->GetInputDim("Y")) { - ctx->ShareDim("X", /*->*/ "Out"); - // NOTE(chenfeiyu): lod & broadcasting is intrinsically contradictory - // so tensors with lod are not supported here - } else { - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - int max_dim = std::max(x_dims.size(), y_dims.size()); - - // start align axis - int axis = std::abs(x_dims.size() - y_dims.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array)); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -101,25 +71,6 @@ class ComplexGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "complex_grad"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "kron_complex_gradgrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "complex_grad"); - - auto x_grad_name = framework::GradVarName("X"); - if (ctx->HasOutput(x_grad_name)) { - ctx->ShareDim("X", /*->*/ x_grad_name); - } - - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(y_grad_name)) { - ctx->ShareDim("Y", /*->*/ y_grad_name); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -135,18 +86,21 @@ class ComplexGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(complex, + ComplexInferShapeFunctor, + PD_INFER_META(phi::ComplexInferMeta)); + REGISTER_OPERATOR(complex, ops::ComplexOp, ops::ComplexOpMaker, ops::ComplexGradOpMaker, - ops::ComplexGradOpMaker); - -REGISTER_OPERATOR(complex_grad, ops::ComplexGradOp); + ops::ComplexGradOpMaker, + ComplexInferShapeFunctor); -REGISTER_OP_CPU_KERNEL(complex, - ops::ComplexKernel, - ops::ComplexKernel); +DECLARE_INFER_SHAPE_FUNCTOR(complex_grad, + ComplexGradInferShapeFunctor, + PD_INFER_META(phi::ComplexGradInferMeta)); -REGISTER_OP_CPU_KERNEL(complex_grad, - ops::ComplexGradKernel, - ops::ComplexGradKernel); +REGISTER_OPERATOR(complex_grad, + ops::ComplexGradOp, + ComplexGradInferShapeFunctor); diff --git a/paddle/fluid/operators/complex_op.cu b/paddle/fluid/operators/complex_op.cu deleted file mode 100644 index c9bc2d459e73b..0000000000000 --- 
a/paddle/fluid/operators/complex_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/complex_op.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - complex, - ops::ComplexKernel, - ops::ComplexKernel); - -REGISTER_OP_CUDA_KERNEL( - complex_grad, - ops::ComplexGradKernel, - ops::ComplexGradKernel); diff --git a/paddle/fluid/operators/complex_op.h b/paddle/fluid/operators/complex_op.h deleted file mode 100644 index 5fb19b46ec6a0..0000000000000 --- a/paddle/fluid/operators/complex_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -// functors to use with ElementwiseComputeEx -template -struct RealAndImagToComplexFunctor { - inline HOSTDEVICE platform::complex operator()(const T x, const T y) { - return platform::complex(x, y); - } -}; - -template -struct ImagAndRealToComplexFunctor { - inline HOSTDEVICE platform::complex operator()(const T y, const T x) { - return platform::complex(x, y); - } -}; - -template -struct ComplexGradForRealFunctor { - inline HOSTDEVICE T operator()(const T x, - const T y, - const platform::complex out, - const platform::complex dout) { - return dout.real; - } -}; - -template -struct ComplexGradForImagFunctor { - inline HOSTDEVICE T operator()(const T x, - const T y, - const platform::complex out, - const platform::complex dout) { - return dout.imag; - } -}; - -template -class ComplexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - using C = platform::complex; - z->mutable_data(ctx.GetPlace()); - -// NOTE(chenfeiyu): be careful of the caveats of calling elementwise-related -// facility functions -#if defined(__NVCC__) || defined(__HIPCC__) - ElementwiseComputeEx, DeviceContext, T, C>( - ctx, x, y, /*axis*/ -1, RealAndImagToComplexFunctor(), z); -#else - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T, C>( - ctx, x, y, /*axis*/ -1, RealAndImagToComplexFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T, C>( - ctx, x, y, /*axis*/ -1, ImagAndRealToComplexFunctor(), z); - } -#endif - } -}; - -template -class ComplexGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - using C = platform::complex; - - // skip out in a hacky way - auto* out = dout; - ElemwiseGradCompute, - ComplexGradForImagFunctor, - C>(ctx, - *x, - *y, - *out, - *dout, - /*axis*/ -1, - dx, - dy, - ComplexGradForRealFunctor(), - ComplexGradForImagFunctor()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index aa86c0f34db55..cd01c23641010 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -342,6 +342,15 @@ func : clip backward : clip_grad +- api : complex + args : (Tensor x, Tensor y) + output : Tensor + infer_meta : + func : ComplexInferMeta + kernel : + func : complex + backward : complex_grad + - api : concat args : (Tensor[] x, Scalar(int64_t) axis) output : Tensor diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index f01598e643420..b4972c68a6477 100644 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -306,6 +306,16 @@ backward : clip_double_grad inplace : (out_grad -> x_grad) +- backward_api : complex_grad + 
forward : complex (Tensor x, Tensor y) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : ComplexGradInferMeta + kernel : + func : complex_grad + data_type : x + - backward_api : concat_double_grad forward : concat_grad (Tensor[] x, Tensor grad_out, Scalar axis) -> Tensor[](grad_x) args : (Tensor[] grad_x_grad, Scalar axis = 0) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index f59ea5549bd71..dd2d1eb482c8e 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -83,6 +83,23 @@ void ChannelShuffleGradInferMeta(const MetaTensor& out_grad, x_grad->set_dtype(out_grad.dtype()); } +void ComplexGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy) { + auto x_dims = x.dims(); + if (dx) { + dx->set_dims(x_dims); + dx->set_dtype(x.dtype()); + } + auto y_dims = y.dims(); + if (dy) { + dy->set_dims(y_dims); + dy->set_dtype(y.dtype()); + } +} + void ConvTransposeGradInferMeta(const MetaTensor& x, const MetaTensor& filter, const MetaTensor& dout, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 0e7ed640d8ffb..6a4eba74b47be 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -42,6 +42,12 @@ void ChannelShuffleGradInferMeta(const MetaTensor& out_grad, const std::string& data_format, MetaTensor* x_grad); +void ComplexGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy); + void ConvTransposeGradInferMeta(const MetaTensor& x, const MetaTensor& filter, const MetaTensor& dout, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 269286d76d954..460b0a9e1bdc4 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/cpu/conv_util.h" @@ -358,6 +359,37 @@ void CompareAllInferMeta(const MetaTensor& x, out->set_dtype(DataType::BOOL); } +void ComplexInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + if (x.dims() == y.dims()) { + auto sizes = vectorize(x.dims()); + out->set_dims(phi::make_ddim(sizes)); + out->set_dtype(dtype::ToComplex(x.dtype())); + // NOTE(chenfeiyu): lod & broadcasting is intrinsically contradictory + // so tensors with lod are not supported here + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + int max_dim = std::max(x_dims.size(), y_dims.size()); + + // start align axis + int axis = std::abs(x_dims.size() - y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + phi::funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + out->set_dims(phi::make_ddim(out_dims_array)); + out->set_dtype(dtype::ToComplex(x.dtype())); + } +} + void ConvInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 9709edf63ccc0..12922ed536add 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -74,6 +74,10 @@ void CompareInferMeta(const MetaTensor& x, int axis, MetaTensor* out); +void ComplexInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); + void ConvInferMeta(const MetaTensor& input, const MetaTensor& filter, const std::vector& strides, diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h index be13e2826ea81..91c47538e958d 100644 --- a/paddle/phi/kernels/complex_grad_kernel.h +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -28,4 +28,12 @@ void ImagGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx); +template +void ComplexGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy); + } // namespace phi diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 07f93f9b926f1..ad66b890b3d5a 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -30,6 +30,12 @@ void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); template void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); +template +void ComplexKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + // If T is complex template < typename T, diff --git a/paddle/phi/kernels/cpu/complex_grad_kernel.cc b/paddle/phi/kernels/cpu/complex_grad_kernel.cc index 11b7a05834607..049022f01e7c0 100644 --- a/paddle/phi/kernels/cpu/complex_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_grad_kernel.cc @@ -31,3 +31,8 @@ PD_REGISTER_KERNEL(imag_grad, phi::ImagGradKernel, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL( + complex_grad, CPU, ALL_LAYOUT, phi::ComplexGradKernel, float, double) { + kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc 
b/paddle/phi/kernels/cpu/complex_kernel.cc index bef0b7b747a42..9e6c72ae7c16a 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -49,3 +49,8 @@ PD_REGISTER_KERNEL(imag, phi::dtype::complex) { kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } + +PD_REGISTER_KERNEL( + complex, CPU, ALL_LAYOUT, phi::ComplexKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu index 450b32291c4bc..e9fd5e1fa5834 100644 --- a/paddle/phi/kernels/gpu/complex_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu @@ -31,3 +31,8 @@ PD_REGISTER_KERNEL(real_grad, phi::RealGradKernel, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL( + complex_grad, GPU, ALL_LAYOUT, phi::ComplexGradKernel, float, double) { + kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index d0ee78202b060..5c5bf104128d3 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -50,3 +50,8 @@ PD_REGISTER_KERNEL(imag, phi::dtype::complex) { kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } + +PD_REGISTER_KERNEL( + complex, GPU, ALL_LAYOUT, phi::ComplexKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h index 03896a2353dda..f7366b32e1105 100644 --- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" #include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -47,4 +48,51 @@ void ImagGradKernel(const Context& dev_ctx, for_range(functor); } +template +struct ComplexGradForRealFunctor { + inline HOSTDEVICE T operator()(const T x, + const T y, + const phi::dtype::complex out, + const phi::dtype::complex dout) { + return dout.real; + } +}; + +template +struct ComplexGradForImagFunctor { + inline HOSTDEVICE T operator()(const T x, + const T y, + const phi::dtype::complex out, + const phi::dtype::complex dout) { + return dout.imag; + } +}; + +template +void ComplexGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy) { + using C = phi::dtype::complex; + + // skip out in a hacky way + auto out = dout; + phi::funcs::ElemwiseGradCompute, + ComplexGradForImagFunctor, + C>(dev_ctx, + x, + y, + out, + dout, + /*axis*/ -1, + dx, + dy, + ComplexGradForRealFunctor(), + ComplexGradForImagFunctor()); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index 72b1328833979..8bd7823411964 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -15,7 +15,9 @@ #pragma once // See Note [ Why still include the fluid headers? 
] +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -61,4 +63,45 @@ void ImagKernel(const Context& dev_ctx, for_range(functor); } +// functors to use with ElementwiseComputeEx +template +struct RealAndImagToComplexFunctor { + inline HOSTDEVICE phi::dtype::complex operator()(const T x, const T y) { + return phi::dtype::complex(x, y); + } +}; + +template +struct ImagAndRealToComplexFunctor { + inline HOSTDEVICE phi::dtype::complex operator()(const T y, const T x) { + return phi::dtype::complex(x, y); + } +}; + +template +void ComplexKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + using C = phi::dtype::complex; + dev_ctx.template Alloc(out); + +// NOTE(chenfeiyu): be careful of the caveats of calling elementwise-related +// facility functions +#if defined(__NVCC__) || defined(__HIPCC__) + phi::funcs::ElementwiseCompute, T, C>( + dev_ctx, x, y, /*axis*/ -1, RealAndImagToComplexFunctor(), out); +#else + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + phi::funcs::ElementwiseCompute, T, C>( + dev_ctx, x, y, /*axis*/ -1, RealAndImagToComplexFunctor(), out); + } else { + phi::funcs::ElementwiseCompute, T, C>( + dev_ctx, x, y, /*axis*/ -1, ImagAndRealToComplexFunctor(), out); + } +#endif +} + } // namespace phi diff --git a/paddle/phi/ops/compat/complex_sig.cc b/paddle/phi/ops/compat/complex_sig.cc index 88156677d34df..da47e2c7bc750 100644 --- a/paddle/phi/ops/compat/complex_sig.cc +++ b/paddle/phi/ops/compat/complex_sig.cc @@ -24,7 +24,14 @@ KernelSignature ImagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("imag_grad", {"Out@GRAD"}, {}, {"X@GRAD"}); } +KernelSignature ComplexGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "complex_grad", {"X", "Y", "Out@GRAD"}, {}, {"X@GRAD", "Y@GRAD"}); +} + } // namespace phi PD_REGISTER_ARG_MAPPING_FN(real_grad, phi::RealGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(imag_grad, phi::ImagGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(complex_grad, phi::ComplexGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py index 1faef17a2ade3..49ad644b0ab75 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_op.py +++ b/python/paddle/fluid/tests/unittests/test_complex_op.py @@ -58,6 +58,7 @@ def init_spec(self): def setUp(self): self.op_type = "complex" + self.python_api = paddle.complex self.init_spec() x = np.random.randn(*self.x_shape).astype(self.dtype) y = np.random.randn(*self.y_shape).astype(self.dtype) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index b73fe74a40ba2..85f8ba4aa4f45 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1701,6 +1701,9 @@ def complex(real, imag, name=None): # [[0.+0.j 0.+1.j 0.+2.j] # [1.+0.j 1.+1.j 1.+2.j]] """ + if in_dygraph_mode(): + return _C_ops.final_state_complex(real, imag) + if paddle.in_dynamic_mode(): return paddle._C_ops.complex(real, imag) From 77c010a01dfe8892f044517b3f94341c2c9ab086 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Wed, 13 Jul 2022 16:04:18 +0800 Subject: [PATCH 176/250] fix bug of pp (#44276) --- .../pp_utils/p2p_communication.py | 40 
++++++++++++------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index 17c7f5a9bbc4a..6f917d9f89d6a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -54,25 +54,29 @@ def __init__(self): def _recv_shape_dtype(self, group): # recv len(shape) dims = paddle.to_tensor([0]) - paddle.distributed.recv(dims, src=0, group=group) + src_rank = group.ranks[0] + + paddle.distributed.recv(dims, src=src_rank, group=group) dims = dims.item() # recv shape shape = paddle.to_tensor([0] * dims) - paddle.distributed.recv(shape, src=0, group=group) + paddle.distributed.recv(shape, src=src_rank, group=group) # recv dtype dtype = paddle.to_tensor([0]) - paddle.distributed.recv(dtype, src=0, group=group) + paddle.distributed.recv(dtype, src=src_rank, group=group) # recv stop_gradient stop_grad = paddle.to_tensor([0]) - paddle.distributed.recv(stop_grad, src=0, group=group) + paddle.distributed.recv(stop_grad, src=src_rank, group=group) return shape.numpy().tolist(), dtype.item(), stop_grad.item() def recv_meta(self, group): tensor_type = paddle.to_tensor([0]) - paddle.distributed.recv(tensor_type, src=0, group=group) + src_rank = group.ranks[0] + + paddle.distributed.recv(tensor_type, src=src_rank, group=group) tensor_type = tensor_type.item() if tensor_type == 0: @@ -83,7 +87,7 @@ def recv_meta(self, group): elif tensor_type == 1: num = paddle.to_tensor([0]) - paddle.distributed.recv(num, src=0, group=group) + paddle.distributed.recv(num, src=src_rank, group=group) num = num.item() shapes = [] dtypes = [] @@ -101,34 +105,38 @@ def recv_meta(self, group): def _send_dims_shape_dtype(self, tensor, group): # send len(shape) dims = paddle.to_tensor(len(tensor.shape)) - paddle.distributed.send(dims, dst=1, group=group) + dst_rank = group.ranks[1] + + paddle.distributed.send(dims, dst=dst_rank, group=group) # send shape shape = paddle.to_tensor(tensor.shape) - paddle.distributed.send(shape, dst=1, group=group) + paddle.distributed.send(shape, dst=dst_rank, group=group) # send dtype dtype = paddle.to_tensor(paddle_2_number(tensor.dtype)) - paddle.distributed.send(dtype, dst=1, group=group) + paddle.distributed.send(dtype, dst=dst_rank, group=group) # send trainable stop_grad = paddle.to_tensor(int(tensor.stop_gradient)) - paddle.distributed.send(stop_grad, dst=1, group=group) + paddle.distributed.send(stop_grad, dst=dst_rank, group=group) def send_meta(self, tensor, group): + dst_rank = group.ranks[1] + if isinstance(tensor, (paddle.Tensor, core.eager.Tensor)): tensor_type = paddle.to_tensor([0]) # send tensor type - paddle.distributed.send(tensor_type, dst=1, group=group) + paddle.distributed.send(tensor_type, dst=dst_rank, group=group) self._send_dims_shape_dtype(tensor, group) elif isinstance(tensor, tuple): tensor_type = paddle.to_tensor([1]) # send tensor type - paddle.distributed.send(tensor_type, dst=1, group=group) + paddle.distributed.send(tensor_type, dst=dst_rank, group=group) nums = paddle.to_tensor(len(tensor)) - paddle.distributed.send(nums, dst=1, group=group) + paddle.distributed.send(nums, dst=dst_rank, group=group) for d in tensor: assert isinstance(d, (paddle.Tensor, core.eager.Tensor)) @@ -166,6 +174,7 @@ def send_partial(tensor, rank_id=0, group=None, use_calc_stream=True): + # dst: local rank in group if 
group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id @@ -176,7 +185,7 @@ def send_partial(tensor, dst, 'num', nranks, 'id', rank_id) else: return paddle.distributed.send(tensor.detach(), - dst=dst, + dst=group.ranks[dst], group=group, use_calc_stream=use_calc_stream) @@ -187,6 +196,7 @@ def recv_partial(tensor, rank_id=0, group=None, use_calc_stream=True): + # src: local rank in group if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id @@ -198,7 +208,7 @@ def recv_partial(tensor, tensor.shape) else: paddle.distributed.recv(tensor.detach(), - src=src, + src=group.ranks[src], group=group, use_calc_stream=use_calc_stream) From fd6b1a02435d61bcadf812d9f3b16a91f85f0adf Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 13 Jul 2022 16:08:25 +0800 Subject: [PATCH 177/250] Add sparse.coalesce (#44256) * add sparse api coalesce --- paddle/phi/api/yaml/sparse_api.yaml | 7 ++++ .../{coalesced_kernel.h => coalesce_kernel.h} | 13 +++++-- ...coalesced_kernel.cc => coalesce_kernel.cc} | 22 ++++++------ ...coalesced_kernel.cu => coalesce_kernel.cu} | 35 +++++++------------ .../phi/kernels/sparse/sparse_utils_kernel.h | 6 ++-- .../kernels/test_sparse_conv3d_dev_api.cc | 7 ++-- .../tests/kernels/test_sparse_pool_dev_api.cc | 6 ++-- .../tests/unittests/test_sparse_conv_op.py | 1 + .../tests/unittests/test_sparse_utils_op.py | 2 ++ python/paddle/incubate/sparse/__init__.py | 2 ++ python/paddle/incubate/sparse/unary.py | 31 ++++++++++++++++ 11 files changed, 88 insertions(+), 44 deletions(-) rename paddle/phi/kernels/sparse/{coalesced_kernel.h => coalesce_kernel.h} (71%) rename paddle/phi/kernels/sparse/cpu/{coalesced_kernel.cc => coalesce_kernel.cc} (87%) rename paddle/phi/kernels/sparse/gpu/{coalesced_kernel.cu => coalesce_kernel.cu} (87%) diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index d8c275ff1f2e6..4c513ed7d2edd 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -266,6 +266,13 @@ layout : x backward : values_grad +- api: coalesce + args : (Tensor x) + output : Tensor(out) + kernel : + func: coalesce{sparse_coo -> sparse_coo} + layout : x + - api: full_like args : (Tensor x, Scalar value, DataType dtype=DataType::UNDEFINED) output : Tensor(out) diff --git a/paddle/phi/kernels/sparse/coalesced_kernel.h b/paddle/phi/kernels/sparse/coalesce_kernel.h similarity index 71% rename from paddle/phi/kernels/sparse/coalesced_kernel.h rename to paddle/phi/kernels/sparse/coalesce_kernel.h index 0755579a57ade..cb8b98fd87404 100644 --- a/paddle/phi/kernels/sparse/coalesced_kernel.h +++ b/paddle/phi/kernels/sparse/coalesce_kernel.h @@ -22,9 +22,16 @@ namespace phi { namespace sparse { template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out); +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out); + +template +SparseCooTensor Coalesce(const Context& dev_ctx, const SparseCooTensor& x) { + SparseCooTensor coo; + CoalesceKernel(dev_ctx, x, &coo); + return coo; +} } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc similarity index 87% rename from paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc rename to paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc index 9d1f71afceb5e..95d8abd6bcf5c 100644 --- a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc +++ 
b/paddle/phi/kernels/sparse/cpu/coalesce_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" @@ -22,9 +22,9 @@ namespace phi { namespace sparse { template -void CoalescedCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -95,22 +95,22 @@ void CoalescedCPUKernel(const CPUContext& dev_ctx, } template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "CoalescedCPUKernel", ([&] { - CoalescedCPUKernel(dev_ctx, x, out); + x.non_zero_indices().dtype(), "CoalesceCPUKernel", ([&] { + CoalesceCPUKernel(dev_ctx, x, out); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sort, +PD_REGISTER_KERNEL(coalesce, CPU, ALL_LAYOUT, - phi::sparse::CoalescedKernel, + phi::sparse::CoalesceKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu similarity index 87% rename from paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu rename to paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index 405384009df89..f6aedb8b68fc3 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" @@ -27,9 +27,9 @@ namespace phi { namespace sparse { template -void CoalescedGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -55,11 +55,7 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); // 1. 
flatten indices @@ -117,11 +113,7 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(&out_nnz, out_indices.data(), sizeof(IntT), -#ifdef PADDLE_WITH_HIP - hipMemcpyDeviceToHost, -#else - cudaMemcpyDeviceToHost, -#endif + gpuMemcpyDeviceToHost, dev_ctx.stream()); dev_ctx.Wait(); @@ -161,22 +153,21 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, } template -void CoalescedKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalesceKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "CoalescedGPUKernel", ([&] { - CoalescedGPUKernel(dev_ctx, x, out); + x.non_zero_indices().dtype(), "CoalesceGPUKernel", ([&] { + CoalesceGPUKernel(dev_ctx, x, out); })); } - } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sort, +PD_REGISTER_KERNEL(coalesce, GPU, ALL_LAYOUT, - phi::sparse::CoalescedKernel, + phi::sparse::CoalesceKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 93abf70b24412..12d55596a935d 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/coalesced_kernel.h" namespace phi { namespace sparse { @@ -154,9 +153,8 @@ void SparseCooTensorKernel(const Context& dev_ctx, const DenseTensor& indices, const IntArray& dense_shape, SparseCooTensor* out) { - SparseCooTensor before_coalesced( - indices, values, phi::make_ddim(dense_shape.GetData())); - CoalescedKernel(dev_ctx, before_coalesced, out); + *out = + SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); } } // namespace sparse diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index f08c7b0872b93..2efdd47998073 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" @@ -207,6 +208,8 @@ void TestConv3dBase(const std::vector& indices, subm, &d_rulebook); + SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out); + ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -217,7 +220,7 @@ void TestConv3dBase(const std::vector& indices, dev_ctx_cpu, DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW)); phi::Copy(dev_ctx_gpu, - d_out.non_zero_indices(), + tmp_d_out.non_zero_indices(), phi::CPUPlace(), true, &h_indices_tensor); @@ -231,7 +234,7 @@ void TestConv3dBase(const std::vector& indices, phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, - d_out.non_zero_elements(), + tmp_d_out.non_zero_elements(), phi::CPUPlace(), true, &h_features_tensor); diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 460dca59c718c..eeba9cdc131d8 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/sparse/coalesce_kernel.h" #include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" #include "paddle/phi/kernels/sparse/sparse_pool_kernel.h" @@ -157,6 +158,7 @@ void TestMaxPoolBase(const std::vector& indices, dilations, strides, &d_rulebook); + SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out); ASSERT_EQ(correct_out_dims.size(), d_out.dims().size()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, d_out.nnz()); @@ -168,7 +170,7 @@ void TestMaxPoolBase(const std::vector& indices, dev_ctx_cpu, DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW)); phi::Copy(dev_ctx_gpu, - d_out.non_zero_indices(), + tmp_d_out.non_zero_indices(), phi::CPUPlace(), true, &h_indices_tensor); @@ -182,7 +184,7 @@ void TestMaxPoolBase(const std::vector& indices, phi::EmptyLike(dev_ctx_cpu, d_out.non_zero_elements()); phi::Copy(dev_ctx_gpu, - d_out.non_zero_elements(), + tmp_d_out.non_zero_elements(), phi::CPUPlace(), true, &h_features_tensor); diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index e1a9b2428babc..9501b2c89531f 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -53,6 +53,7 @@ def test_conv3d(self): groups=1, data_format="NDHWC") out.backward(out) + out = paddle.incubate.sparse.coalesce(out) assert np.array_equal(correct_out_values, out.values().numpy()) def test_subm_conv3d(self): diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index ac69469cbbd69..53c84c9d1f66a 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -298,6 +298,7 @@ def test_sparse_coo_tensor_sorted(self): 
values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) + sparse_x = paddle.incubate.sparse.coalesce(sparse_x) indices_sorted = [[0, 1], [1, 0]] values_sorted = [5.0, 1.0] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) @@ -310,6 +311,7 @@ def test_sparse_coo_tensor_sorted(self): values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.incubate.sparse.sparse_coo_tensor( indices, values) + sparse_x = paddle.incubate.sparse.coalesce(sparse_x) values_sorted = [[5.0, 5.0], [1.0, 1.0]] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) diff --git a/python/paddle/incubate/sparse/__init__.py b/python/paddle/incubate/sparse/__init__.py index c56ada3468acc..47c7a312e24d8 100644 --- a/python/paddle/incubate/sparse/__init__.py +++ b/python/paddle/incubate/sparse/__init__.py @@ -30,6 +30,7 @@ from .unary import pow from .unary import cast from .unary import neg +from .unary import coalesce from .binary import mv from .binary import matmul @@ -66,4 +67,5 @@ 'subtract', 'multiply', 'divide', + 'coalesce', ] diff --git a/python/paddle/incubate/sparse/unary.py b/python/paddle/incubate/sparse/unary.py index d3fb55b73757a..1725c8791fd30 100644 --- a/python/paddle/incubate/sparse/unary.py +++ b/python/paddle/incubate/sparse/unary.py @@ -472,3 +472,34 @@ def abs(x, name=None): """ return _C_ops.final_state_sparse_abs(x) + + +@dygraph_only +def coalesce(x): + r""" + The coalesce operator includes sort and merge; after coalescing, the indices of x are sorted and unique. + + Parameters: + x (Tensor): the input SparseCooTensor. + + Returns: + Tensor: the coalesced SparseCooTensor. + + Examples: + .. code-block:: python + + import paddle + from paddle.incubate import sparse + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + indices = [[0, 0, 1], [1, 1, 2]] + values = [1.0, 2.0, 3.0] + sp_x = sparse.sparse_coo_tensor(indices, values) + sp_x = sparse.coalesce(sp_x) + print(sp_x.indices()) + #[[0, 1], [1, 2]] + print(sp_x.values()) + #[3.0, 3.0] + """ + return _C_ops.final_state_sparse_coalesce(x) From 42468de17316273b52a968efc84936793af03d03 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 13 Jul 2022 03:23:37 -0500 Subject: [PATCH 178/250] remove approval check for allocation folder, test=document_fix (#44301) --- tools/check_file_diff_approvals.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 9ed85c699d1e9..55b55faabf993 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -238,12 +238,6 @@ if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 328693 6836917 39303645 fi -HAS_MODIFIED_ALLOCATION=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/memory/allocation" || true` -if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You must be approved by zhiqiu and Shixiaowei02 for paddle/fluid/memory/allocation.\nIt is being modularized and refactored. Thanks!\n" - check_approval 1 6888866 39303645 - fi - HAS_MODIFIED_DECLARATIONS=`git diff -U0 upstream/$BRANCH |grep "^+" |grep "paddle/phi/kernels/declarations.h" || true` if [ "${HAS_MODIFIED_DECLARATIONS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="You must be approved by chenwhql or zyfncg for paddle/phi/kernels/declarations.h using.
Thanks!\n" From 7dc7fc4bd8352b1c53f71b3b0720e20912d2598d Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Wed, 13 Jul 2022 16:33:02 +0800 Subject: [PATCH 179/250] [Auto Parallel] Add comm init control by socket (#44148) * add comm init control by socket * avoid single card instance failure --- .../distributed/auto_parallel/engine.py | 64 ++++++++++++++++++- .../auto_parallel/process_group.py | 25 +++++--- 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index e65a51a09a16f..4fd1ca3114a82 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -15,6 +15,7 @@ import copy import logging from collections import defaultdict +import socket import paddle import paddle.utils as utils @@ -36,7 +37,8 @@ from paddle.distributed.utils import get_logger from paddle.distributed.passes import new_pass, PassContext -# from .cluster import Cluster, get_default_cluster +from ..collective import _get_global_env +from .cluster import Cluster, get_default_cluster from .planner_v2 import Planner from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator @@ -60,8 +62,8 @@ def __init__(self, self.inputs_spec = self._validate_spec(inputs_spec) self.labels_spec = self._validate_spec(labels_spec) self.cluster = cluster - # if self.cluster is None: - # self.cluster = get_default_cluster() + if self.cluster is None: + self.cluster = get_default_cluster() self.strategy = strategy if self.strategy is None: self.strategy = fleet.DistributedStrategy() @@ -314,10 +316,66 @@ def _initialize(self, mode): # Traverse different rank programs and traverse each op of them, # instantiate communication by process_mapping. all_process_groups = get_all_process_groups() + + has_recv_by_socket = [] + # This is a magic number and the rank number for training is usually less than 5000 + magic_num = 5000 + genv = _get_global_env() + cur_rank_ip, cur_rank_port = genv.current_endpoint.split(":") + cur_rank_recv_port = int(cur_rank_port) + magic_num + server_socket = None + # Large enough for recv rank + buff_size = 1024 + server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_socket.bind((cur_rank_ip, cur_rank_recv_port)) + # The 10 is an empirical value + server_socket.listen(10) + client_sockets = {} for process_group in all_process_groups: if self._cur_rank not in process_group.ranks: continue + if len(process_group.ranks) == 2: + index = process_group.ranks.index(self._cur_rank) + is_send = True if index == 0 else False + if is_send: + recv_rank = process_group.ranks[1] + recv_rank_ip, recv_rank_port = genv.trainer_endpoints[ + recv_rank].split(":") + connect_port = int(recv_rank_port) + magic_num + client_socket = socket.socket(socket.AF_INET, + socket.SOCK_STREAM) + client_socket.connect((recv_rank_ip, connect_port)) + client_socket.send(str(self._cur_rank).encode('utf-8')) + rank = client_socket.recv(buff_size).decode('utf-8') + rank = int(rank) + if rank != recv_rank: + raise ValueError( + "Please check comm pair, the recv rank should be {} but got {}." + .format(recv_rank, rank)) + else: + print("It is able to instantiate {} as sender now.". 
+ format(process_group.ranks)) + client_socket.close() + else: + send_rank = process_group.ranks[0] + while True: + if send_rank not in has_recv_by_socket: + client_socket, recv_addr = server_socket.accept( + ) + rank = int( + client_socket.recv(buff_size).decode()) + client_sockets[rank] = client_socket + has_recv_by_socket.append(rank) + else: + client_sockets[send_rank].send( + str(self._cur_rank).encode("utf-8")) + client_sockets[send_rank].close() + print( + "It is able to instantiate {} as recver now." + .format(process_group.ranks)) + break process_group.instantiate() + server_socket.close() self._place = _get_device() if isinstance(self._place, fluid.CUDAPlace): diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index d583dcb32eb22..74cb6930e0392 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License +from collections import OrderedDict + import paddle import paddle.fluid.core as core from ..collective import _get_global_env @@ -130,16 +132,23 @@ def instantiate(self): else: assert False, ("No CUDA device found") - # TODO(shenliang03): This is a temporary solution to solve the problem of - # hang caused by cross-creation of new_group - tmp = paddle.to_tensor( - [1], dtype="int32") if _non_static_mode() else fill_constant( - [0], dtype="int32", value="1") - paddle.distributed.all_reduce(tmp, use_calc_stream=True) - paddle.distributed.wait(tmp) + # TODO(shenliang03): This is a temporary solution to solve the problem of + # hang caused by cross-creation of new_group + paddle.framework._in_legacy_dygraph() + paddle.set_device('gpu:%d' % + paddle.distributed.ParallelEnv().dev_id) + tmp = paddle.to_tensor( + [1], dtype="int32") if _non_static_mode() else fill_constant( + [0], dtype="int32", value="1") + paddle.distributed.all_reduce(tmp, use_calc_stream=True, group=self) + paddle.distributed.wait(tmp, group=self) + paddle.enable_static() self._is_instantiate = True + def is_member(self): + return True + # def __eq__(self, other): # if not isinstance(other, ProcessGroup): # return False @@ -158,5 +167,5 @@ def __str__(self): # Note that Process group 0 is reserved for representing all ranks. # At the beginning, group 0 is empty and new ranks will be added automatically. 
-_g_process_group_map = {} +_g_process_group_map = OrderedDict() _g_process_group_map[0] = ProcessGroup(0, []) From daa6cb921c24c6dddb86293b0925612c636e96f6 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Wed, 13 Jul 2022 16:41:07 +0800 Subject: [PATCH 180/250] [CustomKernel] phi capi add inference support (#44268) --- paddle/fluid/inference/CMakeLists.txt | 4 ++++ paddle/fluid/inference/api/CMakeLists.txt | 3 +++ paddle/fluid/inference/api/api.cc | 4 ++++ paddle/fluid/inference/paddle_inference_custom_device.map | 1 + paddle/phi/capi/include/c_tensor.h | 2 +- paddle/phi/capi/include/wrapper_base.h | 2 +- paddle/phi/capi/lib/c_tensor.cc | 2 +- 7 files changed, 15 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 6ff4655429604..7f2daa942b057 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -40,6 +40,10 @@ get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(utils_modules stringpiece pretty_log string_helper benchmark) +if(WITH_CUSTOM_DEVICE) + set(fluid_modules ${fluid_modules} phi_capi) +endif() + add_subdirectory(api) # Create static inference library if needed diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9e601df8088fc..3aff5d5536a23 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -55,6 +55,9 @@ set(paddle_inference_api_deps if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) endif() +if(WITH_CUSTOM_DEVICE) + set(paddle_inference_api_deps ${paddle_inference_api_deps} phi_capi) +endif() cc_library( paddle_inference_api diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index d5897e3c4f2a7..054b4668c4cc6 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -156,3 +156,7 @@ std::shared_ptr MakeCipher(const std::string &config_file) { #endif } // namespace paddle + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif diff --git a/paddle/fluid/inference/paddle_inference_custom_device.map b/paddle/fluid/inference/paddle_inference_custom_device.map index 52bc2870482e2..d78860e0a2070 100644 --- a/paddle/fluid/inference/paddle_inference_custom_device.map +++ b/paddle/fluid/inference/paddle_inference_custom_device.map @@ -5,6 +5,7 @@ *profile*; *phi*; *FLAGS_*; + PD_*; local: *; }; diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h index 35ac7dda3964d..2bebee977740b 100644 --- a/paddle/phi/capi/include/c_tensor.h +++ b/paddle/phi/capi/include/c_tensor.h @@ -24,7 +24,7 @@ extern "C" { typedef struct PD_Tensor PD_Tensor; -PD_DataType PD_TensorGetDataType(const PD_Tensor *tensor, PD_Status *status); +PD_DataType PD_TensorGetPDDataType(const PD_Tensor *tensor, PD_Status *status); PD_DataLayout PD_TensorGetDataLayout(const PD_Tensor *tensor, PD_Status *status); diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h index 2b5421bc266cf..adfb2b5a0e050 100644 --- a/paddle/phi/capi/include/wrapper_base.h +++ b/paddle/phi/capi/include/wrapper_base.h @@ -128,7 +128,7 @@ class DenseTensor : public WrapperBase { PD_DataType dtype() const { C_Status status; - auto data_type = PD_TensorGetDataType(raw_data(), &status); + auto data_type = PD_TensorGetPDDataType(raw_data(), &status); PD_CHECK_STATUS(status); return data_type; } diff --git 
a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc index c81eefe22f77e..b460d2e368607 100644 --- a/paddle/phi/capi/lib/c_tensor.cc +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/meta_tensor.h" -PD_DataType PD_TensorGetDataType(const PD_Tensor* tensor, PD_Status* status) { +PD_DataType PD_TensorGetPDDataType(const PD_Tensor* tensor, PD_Status* status) { if (status) { if (!tensor) { *status = C_FAILED; From 07f33da94722c9ddbae4f85a2004b0f3b79968d4 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Wed, 13 Jul 2022 17:00:26 +0800 Subject: [PATCH 181/250] [Auto parallel] Accelerate procedure of partitioning and generating dist graphs (#44224) * avoid sync with cpp in partition op * delay eval & predict mode * bugfix for gradient merge pass --- .../distributed/auto_parallel/engine.py | 33 +++++++++++++++---- .../dist_check_finite_and_unscale.py | 4 +-- .../auto_parallel/operators/dist_default.py | 12 ++----- .../auto_parallel/operators/dist_embedding.py | 5 +-- .../dist_fill_constant_batch_size_like.py | 1 - .../auto_parallel/operators/dist_matmul.py | 6 +--- .../auto_parallel/operators/dist_pnorm.py | 8 ++--- .../auto_parallel/operators/dist_reduce_p.py | 3 +- .../auto_parallel/operators/dist_reshape.py | 12 ++----- .../operators/dist_update_loss_scaling.py | 3 +- .../auto_parallel/parallelizer_v2.py | 33 ++++++++++++++++++- .../passes/auto_parallel_gradient_merge.py | 2 +- 12 files changed, 71 insertions(+), 51 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 4fd1ca3114a82..1e1e37b4435ce 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -85,6 +85,11 @@ def __init__(self, self._feed_vars = {} self._fetch_vars = {} self._planners = {} + self._mode_init_states = { + "train": False, + "eval": False, + "predict": False + } self._dygraph_mode = False def prepare(self, @@ -101,6 +106,7 @@ def prepare(self, " or `paddle.fluid.optimizer.Optimizer`." ) self._optimizer = optimizer + self._all_ranks = all_ranks if loss and not isinstance(loss, paddle.nn.Layer) and not callable(loss): @@ -116,22 +122,23 @@ def prepare(self, metric.__class__.__name__) self._metrics = to_list(metrics) self._gradient_scale = gradient_scale - self._planned_mode = None - self._modes = ['train', 'eval', 'predict'] + self._prepare_single_mode("train") - # Build program and do auto parallel process - for mode in self._modes: - # Build forward program - self._build(mode) + def _prepare_single_mode(self, mode): + self._modes = [mode] + self._build(self._modes[0]) + # Do auto parallel process for mode in self._modes: # Do the planning process self._plan(mode) for mode in self._modes: # Do the parallel process - self._parallel(mode, all_ranks) + self._parallel(mode, self._all_ranks) + # Init comm and startup program self._initialize(mode) + self._mode_init_states[mode] = True def _build(self, mode): @@ -432,6 +439,12 @@ def fit(self, return_numpy=True): # TODO: callbacks # TODO: evaluate after training + + if not self._mode_init_states['train']: + raise Exception( + "train program is not initialized yet, please call engine.prepare() before calling fit() funtion." + ) + self.mode = 'train' assert self.mode in self._dist_main_progs, \ "train model is not ready, please call `engine.prepare()` first." 
@@ -467,6 +480,9 @@ def evaluate(self, use_program_cache=False, return_numpy=True): self.mode = 'eval' + if not self._mode_init_states[self.mode]: + self._prepare_single_mode(self.mode) + assert self.mode in self._dist_main_progs, \ "eval model is not ready, please call `engine.prepare()` first." eval_dataloader = self._create_dataloader(eval_data, batch_size) @@ -509,6 +525,9 @@ def predict(self, use_program_cache=False, return_numpy=True): self.mode = 'predict' + if not self._mode_init_states[self.mode]: + self._prepare_single_mode(self.mode) + assert self.mode in self._dist_main_progs, \ "predict model is not ready, please call `engine.prepare()` first." test_dataloader = self._create_dataloader(test_data, batch_size) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py index b00f1a589e312..108b99fdce613 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_check_finite_and_unscale.py @@ -113,12 +113,11 @@ def backward(ctx, *args, **kwargs): filter_vars.append(varname) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(backward_op.desc) set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) dist_op_desc.set_input('X', filter_vars) dist_op_desc.set_output('Out', filter_vars) - main_block._sync_with_cpp() # sync result group = new_process_group(world_process_group.ranks) @@ -155,7 +154,6 @@ def backward(ctx, *args, **kwargs): "out_dtype": inf_var.dtype, OP_ROLE_KEY: OpRole.Optimize }) - main_block._sync_with_cpp() for op in [cast_op1, allreduce_op, cast_op2]: new_op_dist_attr = OperatorDistributedAttribute() diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index a2b1b7826d51f..9d9d5371aca3e 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -363,7 +363,7 @@ def forward(ctx, *args, **kwargs): output_name) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): @@ -371,8 +371,6 @@ def forward(ctx, *args, **kwargs): for output_name in src_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) - main_block._sync_with_cpp() - # data parallel synchronization for primtive operators from paddle.incubate.autograd import prim_enabled if prim_enabled(): @@ -431,8 +429,6 @@ def forward(ctx, *args, **kwargs): op_attr.set_input_dims_mapping(param.name, dims_mapping) ctx.set_op_dist_attr_for_program(new_op, op_attr) - startup_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): @@ -461,7 +457,7 @@ def backward(ctx, *args, **kwargs): output_name) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(backward_op.desc) # Refer to the related dist op set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) @@ -470,8 +466,6 @@ def backward(ctx, *args, **kwargs): for output_name in 
backward_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) - main_block._sync_with_cpp() - # check if need gradient allreduce # if there is a non-gradient & non-parameter input and its batch dimension is splited, # we need insert gradient allreduce for the gradient of parameter in its output @@ -552,8 +546,6 @@ def backward(ctx, *args, **kwargs): dims_mapping) ctx.set_op_dist_attr_for_program(op, op_attr) - main_block._sync_with_cpp() - register_distributed_operator_impl( "default", DistributedDefaultImpl0("replicate_parallel")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 80c9b8641ba36..aa463398139ba 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -312,7 +312,6 @@ def forward(ctx, *args, **kwargs): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Forward }) - startup_block._sync_with_cpp() @staticmethod def backward(ctx, *args, **kwargs): @@ -412,8 +411,7 @@ def backward(ctx, *args, **kwargs): set_comm_op_dist_attr_for_program(c_identity_op, dist_attr.process_mesh, out_grad_dist_attr, ctx) - main_block._sync_with_cpp() - c_embedding_grad_op_desc = main_block.desc.append_op() + c_embedding_grad_op_desc = main_block.append_op(type='nop').desc c_embedding_grad_op_desc.set_type("c_embedding_grad") c_embedding_grad_op_desc.set_input('Ids', [Ids_var.name]) c_embedding_grad_op_desc.set_input('W', [Weight_var.name]) @@ -422,7 +420,6 @@ def backward(ctx, *args, **kwargs): c_embedding_grad_op_desc.set_output('W@GRAD', [Weight_grad.name]) c_embedding_grad_op_desc._set_attr('start_index', relative_idx) c_embedding_grad_op_desc._set_attr(OP_ROLE_KEY, OpRole.Backward) - main_block._sync_with_cpp() c_embedding_grad_op = main_block.ops[-1] assert c_embedding_grad_op.type == "c_embedding_grad" diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py index 763e47802b333..27e8983707b72 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py @@ -118,7 +118,6 @@ def forward(ctx, *args, **kwargs): shape_list[idx] = shape_list[idx] // process_mesh_shape[axis] op._set_attr("shape", shape_list) - main_block._sync_with_cpp() @staticmethod def backward(ctx, *args, **kwargs): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 0826148208ec0..4e9aefd168c4f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -38,7 +38,7 @@ def copy_op_with_new_input_output(ctx, block, src_op, **kwargs): - dist_op_desc = block.desc.append_op() + dist_op_desc = block.append_op(type='nop').desc dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): @@ -48,7 +48,6 @@ def copy_op_with_new_input_output(ctx, block, src_op, **kwargs): assert input_name in kwargs dist_op_desc.set_output(output_name, kwargs[output_name]) - block._sync_with_cpp() return dist_op_desc @@ -387,8 +386,6 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): 
matmul_op_desc = copy_op_with_new_input_output(ctx, main_block, backward_op, **kwargs) - main_block._sync_with_cpp() - # check if need gradient allreduce need_gradient_allreduce = False @@ -468,7 +465,6 @@ def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Forward }) - startup_block._sync_with_cpp() class DistributedMatmul(DistributedOperatorImplContainer): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py index 4629e4bef930e..7eea4bea49f35 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_pnorm.py @@ -248,7 +248,7 @@ def forward(ctx, *args, **kwargs): # rename input kwargs['X'] = [allgather_out.name] # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): @@ -260,8 +260,6 @@ def forward(ctx, *args, **kwargs): allgather_out.name, allgather_out_dist_attr.dims_mapping) ctx.set_op_dist_attr_for_program(pnorm_op, op_dist_attr) - main_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): @@ -305,7 +303,7 @@ def backward(ctx, *args, **kwargs): new_X_var_dist_attr = ctx.get_tensor_dist_attr_for_program(new_X_var) ctx.set_tensor_dist_attr_for_program(new_X_grad, new_X_var_dist_attr) # replicate op in dist program with new kwargs - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(backward_op.desc) # Refer to the related dist op set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) @@ -319,7 +317,6 @@ def backward(ctx, *args, **kwargs): op_dist_attr.set_output_dims_mapping(new_X_grad.name, new_X_var_dist_attr.dims_mapping) ctx.set_op_dist_attr_for_program(p_norm_grad_op, op_dist_attr) - main_block._sync_with_cpp() # 2. 
insert slice op process_mesh_shape = op_dist_attr.process_mesh.topology @@ -359,7 +356,6 @@ def backward(ctx, *args, **kwargs): slice_op_dist_attr.set_output_dims_mapping(X_grad_var.name, X_grad_var_dims_mapping) ctx.set_op_dist_attr_for_program(slice_op, slice_op_dist_attr) - main_block._sync_with_cpp() register_distributed_operator_impl("p_norm", diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py index 6d750562c96d9..bdd105ef64c30 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py @@ -109,14 +109,13 @@ def forward(ctx, *args, **kwargs): output_name) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(dist_op_desc, src_op.desc, ctx) for input_name in src_op.desc.input_names(): dist_op_desc.set_input(input_name, kwargs[input_name]) for output_name in src_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) - main_block._sync_with_cpp() # batch dimension synchronization var_name = src_op.output_arg_names[0] diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py index 47a783a5f6d71..790e97cf4e170 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reshape.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reshape.py @@ -177,7 +177,7 @@ def forward(ctx, *args, **kwargs): idx] = shape_list[idx] // process_mesh_shape[axis] # create op - new_op_desc = main_block.desc.append_op() + new_op_desc = main_block.append_op(type='nop').desc new_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) @@ -187,8 +187,6 @@ def forward(ctx, *args, **kwargs): new_op_desc.set_output('Out', [Out_var.name]) new_op_desc._set_attr('shape', shape_list) - main_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): DistributedDefaultImpl0.backward(ctx, *args, **kwargs) @@ -335,7 +333,7 @@ def forward(ctx, *args, **kwargs): idx] = shape_list[idx] // process_mesh_shape[axis] # create op - new_op_desc = main_block.desc.append_op() + new_op_desc = main_block.append_op(type='nop').desc new_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) @@ -345,8 +343,6 @@ def forward(ctx, *args, **kwargs): new_op_desc.set_output('Out', [Out_var.name]) new_op_desc._set_attr('shape', shape_list) - main_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): DistributedDefaultImpl0.backward(ctx, *args, **kwargs) @@ -486,7 +482,7 @@ def forward(ctx, *args, **kwargs): idx] = shape_list[idx] // process_mesh_shape[axis] # create op - new_op_desc = main_block.desc.append_op() + new_op_desc = main_block.append_op(type='nop').desc new_op_desc.copy_from(src_op.desc) set_dist_op_desc_original_id(new_op_desc, src_op.desc, ctx) new_op_desc.set_input('ShapeTensor', ShapeTensor_var_list) @@ -496,8 +492,6 @@ def forward(ctx, *args, **kwargs): new_op_desc.set_output('Out', [Out_var.name]) new_op_desc._set_attr('shape', shape_list) - main_block._sync_with_cpp() - @staticmethod def backward(ctx, *args, **kwargs): 
DistributedDefaultImpl0.backward(ctx, *args, **kwargs) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py index 9666f882200e5..cbbcaef5ee47f 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_update_loss_scaling.py @@ -127,12 +127,11 @@ def backward(ctx, *args, **kwargs): filter_vars.append(varname) # replicate op in dist program - dist_op_desc = main_block.desc.append_op() + dist_op_desc = main_block.append_op(type='nop').desc dist_op_desc.copy_from(backward_op.desc) set_dist_op_desc_original_id(dist_op_desc, backward_op.desc, ctx) dist_op_desc.set_input('X', filter_vars) dist_op_desc.set_output('Out', filter_vars) - main_block._sync_with_cpp() register_distributed_operator_impl( diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index d8c0da9e27056..005e51dfce723 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -13,6 +13,8 @@ # limitations under the License. import copy +import time +import logging from collections import defaultdict import paddle @@ -20,6 +22,7 @@ from paddle.fluid.backward import append_backward from paddle.fluid.framework import _non_static_mode from paddle.distributed.passes import new_pass +from paddle.distributed.utils import get_logger from .reshard import Resharder from .partitioner import Partitioner @@ -41,6 +44,7 @@ def __init__(self, mode, completer, dist_context): assert self._dist_context._is_initialized self._pass_context = self._dist_context.pass_context self._strategy = self._dist_context.strategy + self._logger = get_logger(logging.INFO) def parallel_all(self): world_process_group = get_world_process_group() @@ -61,38 +65,65 @@ def parallel(self, rank): serial_startup_program, serial_loss) # Apply pre optimization passes + time0 = time.time() self._apply_pre_optimization(serial_main_program, serial_startup_program, serial_loss, serial_optimizer, params_grads) - + self._logger.info( + "within parallel apply_pre_optimization time: {}, mode {}". + format(time.time() - time0, self._mode)) # Do logical partition + time0 = time.time() partitioner = Partitioner(self._dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( serial_main_program, serial_startup_program, params_grads) + self._logger.info( + "within parallel partitioner time: {}, mode {}".format( + time.time() - time0, self._mode)) # Generate optimizer + time0 = time.time() self._generate_optimizer(dist_main_prog, dist_startup_prog, serial_optimizer, dist_params_grads) + self._logger.info( + "within parallel optimizer time: {}, mode {}".format( + time.time() - time0, self._mode)) # Do reshard process + time0 = time.time() set_grad_var_shape(dist_main_prog, self._dist_context) resharder = Resharder(dist_main_prog, dist_startup_prog, rank, self._dist_context, dist_params_grads) resharder.reshard() + self._logger.info( + "within parallel reshard time: {}, mode {}".format( + time.time() - time0, self._mode)) # Apply post optimization passes + time0 = time.time() self._apply_post_optimization(dist_main_prog, dist_startup_prog, rank, dist_params_grads) + self._logger.info( + "within parallel apply_post_optimization time: {}, mode {}". 
+ format(time.time() - time0, self._mode)) else: # Apply pre optimization passes # self._apply_pre_optimization(serial_main_program, # serial_startup_program, None, None, # None) # Do logical partition + time0 = time.time() partitioner = Partitioner(self._dist_context, rank) dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( serial_main_program, serial_startup_program, []) # Do reshard process + self._logger.info( + "within parallel partitioner time: {}, mode {}".format( + time.time() - time0, self._mode)) + time0 = time.time() resharder = Resharder(dist_main_prog, dist_startup_prog, rank, self._dist_context, [], 1) resharder.reshard() + self._logger.info( + "within parallel reshard time: {}, mode {}".format( + time.time() - time0, self._mode)) # Clone program for test if self._mode != 'train': dist_main_prog = dist_main_prog.clone(for_test=True) diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index 66cce97533efc..717f8fa27f2df 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -58,7 +58,7 @@ def _remove_and_get_optimizer_op(main_program, dist_context): def _remove_op_role_var(param, grad): op_maker = core.op_proto_and_checker_maker op = grad.op - if op.has_attr(op_maker.kOpRoleVarAttrName()): + if op and op.has_attr(op_maker.kOpRoleVarAttrName()): op._remove_attr(op_maker.kOpRoleVarAttrName()) From 441606fd026aa75523a1bb403b3ddeb0a2aaa5b7 Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Wed, 13 Jul 2022 17:20:18 +0800 Subject: [PATCH 182/250] add fifth order test case (#44303) --- .../tests/unittests/autograd/test_primapi.py | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py index dc52c5bc33b48..ec06eda66f8e5 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py @@ -161,7 +161,7 @@ def test_third_order(self): exe = paddle.static.Executor(place) exe.run(startup) outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) + np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-5) paddle.incubate.autograd.disable_prim() def test_fourth_order(self): @@ -196,7 +196,43 @@ def test_fourth_order(self): exe = paddle.static.Executor(place) exe.run(startup) outs = exe.run(main, feed=feed, fetch_list=fetch_list) - np.allclose(outs, result) + np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-5) + paddle.incubate.autograd.disable_prim() + + def test_fifth_order(self): + paddle.incubate.autograd.enable_prim() + main = paddle.static.Program() + startup = paddle.static.Program() + with paddle.static.program_guard(main, startup): + x = paddle.static.data(name='x', shape=[1], dtype='float32') + x2 = paddle.multiply(x, x) + x3 = paddle.multiply(x2, x) + x4 = paddle.multiply(x3, x) + x5 = paddle.multiply(x4, x) + x6 = paddle.multiply(x5, x) + out = x6 + x5 + + grad1, = paddle.incubate.autograd.grad([out], [x]) + grad2, = paddle.incubate.autograd.grad([grad1], [x]) + grad3, = paddle.incubate.autograd.grad([grad2], [x]) + grad4, = paddle.incubate.autograd.grad([grad3], [x]) + grad5, = paddle.incubate.autograd.grad([grad4], [x]) + + paddle.incubate.autograd.prim2orig() + 
+ feed = { + x.name: np.array([2.]).astype('float32'), + } + fetch_list = [grad5.name] + result = [np.array([1560.0])] + + place = paddle.CPUPlace() + if paddle.device.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(startup) + outs = exe.run(main, feed=feed, fetch_list=fetch_list) + np.testing.assert_allclose(outs, result, rtol=1e-5, atol=1e-5) paddle.incubate.autograd.disable_prim() def test_disable_prim(self): From 988abd6afe3ce8951dab03c9754f5a6d33a84668 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 13 Jul 2022 04:53:27 -0500 Subject: [PATCH 183/250] Skip core ops api in static check (#44284) * skip core ops api in static check, test=document_fix * polish cond, test=document_fix --- tools/print_signatures.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 44083d660c6e1..f751709a767a5 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -292,6 +292,11 @@ def parse_args(): help="using get_all_api or from_modulelist") parser.add_argument('module', type=str, help='module', default='paddle') # not used + parser.add_argument('--skipped', + dest='skipped', + type=str, + help='Skip Checking submodules', + default='paddle.fluid.core_avx.eager.ops') if len(sys.argv) == 1: args = parser.parse_args(['paddle']) @@ -320,6 +325,8 @@ def parse_args(): all_api_names_to_k[api_name] = k all_api_names_sorted = sorted(all_api_names_to_k.keys()) for api_name in all_api_names_sorted: + if args.skipped != '' and api_name.find(args.skipped) >= 0: + continue api_info = api_info_dict[all_api_names_to_k[api_name]] print("{0} ({2}, ('document', '{1}'))".format( api_name, md5(api_info['docstring']), api_info['signature'] From ae8ca76468213739b19047bb37f2c525896b1083 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Wed, 13 Jul 2022 18:05:44 +0800 Subject: [PATCH 184/250] [Phi] Migrate matrix_solve to phi (#44298) * [Phi] Migrate matrix_solve to phi * replace mutable_data with Alloc --- paddle/fluid/operators/eig_op.h | 4 +- paddle/fluid/operators/lstsq_op.h | 2 +- paddle/fluid/operators/math/CMakeLists.txt | 1 - paddle/fluid/operators/math/matrix_solve.cc | 41 ---- .../fluid/operators/math/matrix_solve.cu.cc | 189 ------------------ paddle/fluid/operators/solve_op.h | 69 +------ paddle/phi/kernels/funcs/CMakeLists.txt | 1 + paddle/phi/kernels/funcs/matrix_solve.cc | 32 +++ paddle/phi/kernels/funcs/matrix_solve.cu | 178 +++++++++++++++++ .../math => phi/kernels/funcs}/matrix_solve.h | 108 +++++++--- 10 files changed, 302 insertions(+), 323 deletions(-) delete mode 100644 paddle/fluid/operators/math/matrix_solve.cc delete mode 100644 paddle/fluid/operators/math/matrix_solve.cu.cc create mode 100644 paddle/phi/kernels/funcs/matrix_solve.cc create mode 100644 paddle/phi/kernels/funcs/matrix_solve.cu rename paddle/{fluid/operators/math => phi/kernels/funcs}/matrix_solve.h (61%) diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 138a987a0bd9f..82c7fe6881969 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -19,7 +19,6 @@ #include #include -#include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/complex_kernel.h" @@ -30,6 +29,7 @@ #include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include 
"paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_solve.h" #include "paddle/phi/kernels/funcs/slice.h" #include "paddle/phi/kernels/funcs/unsqueeze.h" #include "paddle/phi/kernels/matmul_kernel.h" @@ -366,7 +366,7 @@ void ComputeBackwardForComplexInput( int k = rhs.dims()[rhs.dims().size() - 1]; auto* matrix_data = Vh.data(); auto* rhs_data = rhs.data(); - math::SolveLinearSystem( + phi::funcs::SolveLinearSystem( matrix_data, rhs_data, x_grad_data, m, k, batch_count); } diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index f99e027e9ced2..b3e5894a9451e 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -21,13 +21,13 @@ #include "paddle/fluid/operators/eig_op.h" #include "paddle/fluid/operators/math/eigen_values_vectors.h" -#include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_solve.h" #define EPSILON 1e-6 diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 3f7206ac08bf2..927feedd1851e 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -54,7 +54,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_solve) cc_test( selected_rows_functor_test diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc deleted file mode 100644 index b0f8843a530c0..0000000000000 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_solve.h" - -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixSolveFunctor { - public: - void operator()(const phi::CPUContext& dev_ctx, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* out) { - compute_solve_eigen(dev_ctx, a, b, out); - } -}; - -template class MatrixSolveFunctor; -template class MatrixSolveFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc deleted file mode 100644 index 41b14c07b7360..0000000000000 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ /dev/null @@ -1,189 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_solve.h" - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixSolveFunctor; - -template -class MatrixSolveFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* out) { -#ifndef PADDLE_WITH_HIP - - // solve the equation: Ax = B, - // use cuBlas cublasgetrfBatched funcion to performs the LU - // factorization of each matrix A, - // and then use cuBlas cublasgetriBatched function to solve the - // equation after LU factorization. - // ref: - // https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched - const auto& a_dims = a.dims(); - const int a_rank = a_dims.size(); - int n = a_dims[a_rank - 1]; - int lda = n; - int batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; - - const auto& b_dims = b.dims(); - const int b_rank = b_dims.size(); - int nrhs = b_dims[b_rank - 1]; - int ldb = b_dims[b_rank - 2]; - - // make sure the out dims is right - out->Resize(b_dims); - out->mutable_data(context.GetPlace()); - - // copy input A to a temporary tensor tmp_a, - // LU factorization, written back to original matrix A, so in the beginning, - // it's necessary to create a temporary tensor tmp_a. - Tensor tmp_a(a.dtype()); - tmp_a.Resize(a.dims()); - tmp_a.mutable_data(context.GetPlace()); - framework::TensorCopy(a, context.GetPlace(), &tmp_a); - - // copy input B to a temporary tensor tmp_b, and transpose tmp_b, - // because cuBlas assumes column-major while Paddle uses row-majar. - Tensor tmp_b(b.type()); - const auto& new_dims_vec = getNewDimsVec(b_dims); - tmp_b.Resize(phi::make_ddim(new_dims_vec)); - tmp_b.mutable_data(context.GetPlace()); - phi::funcs::TransposeNormal trans; - std::vector new_axis = getNewAxis(b_rank); - trans(context, b, &tmp_b, new_axis); - - const T* a_data_in_gpu = tmp_a.data(); - const T* b_data_in_gpu = tmp_b.data(); - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = a_data_in_gpu + i * n * n; - cpu_ptrs[i + batch_size] = b_data_in_gpu + i * n * nrhs; - } - - // Copy the addresses of A and tmp_b from host to device. 
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), - tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), - static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), - context.stream()); - - T** gpu_tmp_b_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for BatchedGETRF's info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - // only for singular checking - std::vector info; - info.resize(batch_size); - - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - - // This function performs the LU factorization of each matrix A by the - // equation A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. - blas.BatchedGETRF(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, - gpu_info_ptr, - batch_size); - - // check whether BatchedGETRF is executed successfully or not - memory::Copy(platform::CPUPlace(), - info.data(), - context.GetPlace(), - gpu_info_ptr, - sizeof(int) * batch_size, - context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], - 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, - info[i], - info[i])); - } - - // hold the result code from BatchedGETRS - int host_info = 0; - - // to solve the equation after LU factorization - CBLAS_TRANSPOSE transA = CblasTrans; - blas.BatchedGETRS(transA, - n, - nrhs, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - lda, - gpu_pivot_ptr, - gpu_tmp_b_ptrs, - ldb, - &host_info, - batch_size); - - // check whether BatchedGETRS is executed successfully or not - PADDLE_ENFORCE_EQ(host_info, - 0, - platform::errors::InvalidArgument( - "The [%d]'th argument to cublas*getrsBatched had " - "an illegal value.", - -host_info)); - - // transpose tmp_b to get the final result in row-major form. - phi::funcs::TransposeNormal trans2; - trans2(context, tmp_b, out, new_axis); - -#else - compute_solve_eigen(context, a, b, out); -#endif - } -}; - -template class MatrixSolveFunctor; -template class MatrixSolveFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h index b97b8d01ccd37..115223749431b 100644 --- a/paddle/fluid/operators/solve_op.h +++ b/paddle/fluid/operators/solve_op.h @@ -20,11 +20,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" #include "paddle/fluid/operators/squeeze_op.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_solve.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #endif @@ -351,7 +351,7 @@ static void linalg_solve(const framework::ExecutionContext& context, out->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::MatrixSolveFunctor mat_solve; + phi::funcs::MatrixSolveFunctor mat_solve; // input y can be vector or matrix // but need to be unsqueezed if y is a vector @@ -425,67 +425,6 @@ static void linalg_solve(const framework::ExecutionContext& context, } } -// for TransposeNormal -static std::vector getNewAxis(const int b_rank) { - std::vector axis_1 = {0}; - std::vector axis_2 = {1, 0}; - std::vector axis_3 = {0, 2, 1}; - std::vector axis_4 = {0, 1, 3, 2}; - std::vector axis_5 = {0, 1, 2, 4, 3}; - std::vector axis_6 = {0, 1, 2, 3, 5, 4}; - std::vector axis_7 = {0, 1, 2, 3, 4, 6, 5}; - std::vector axis_8 = {0, 1, 2, 3, 4, 5, 7, 6}; - std::vector axis_9 = {0, 1, 2, 3, 4, 5, 6, 8, 7}; - switch (b_rank) { - case 1: - return axis_1; - break; - case 2: - return axis_2; - break; - case 3: - return axis_3; - break; - case 4: - return axis_4; - break; - case 5: - return axis_5; - break; - case 6: - return axis_6; - break; - case 7: - return axis_7; - break; - case 8: - return axis_8; - break; - default: - return axis_9; - } -} - -// for Resize -static std::vector getNewDimsVec(const DDim& b_dims) { - std::vector b_dims_vec = phi::vectorize(b_dims); - int size = b_dims_vec.size(); - if (size >= 2) { - // swap the last 2 elements in b_dims_vec - int64_t temp = b_dims_vec[size - 1]; - b_dims_vec[size - 1] = b_dims_vec[size - 2]; - b_dims_vec[size - 2] = temp; - return b_dims_vec; - } - PADDLE_ENFORCE_NE( - b_dims_vec.empty(), - true, - platform::errors::PreconditionNotMet( - "The size of tensor b must not be %d after getting new dims", 0)); - // if b_dims_vec.size() == 1, just retun original vec - return b_dims_vec; -} - template class SolveKernel : public framework::OpKernel { public: @@ -553,11 +492,11 @@ class SolveGradKernel : public framework::OpKernel { tmp_dy.mutable_data(ctx.GetPlace()); Tensor tmp_input(input->dtype()); - const auto& new_dims_vec = getNewDimsVec(input->dims()); + const auto& new_dims_vec = phi::funcs::getNewDimsVec(input->dims()); tmp_input.Resize(phi::make_ddim(new_dims_vec)); tmp_input.mutable_data(ctx.GetPlace()); phi::funcs::TransposeNormal trans; - std::vector new_axis = getNewAxis(input->dims().size()); + std::vector new_axis = phi::funcs::getNewAxis(input->dims().size()); auto& dev_ctx = ctx.template device_context(); trans(dev_ctx, *input, &tmp_input, new_axis); diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 6d16fc8f81895..25696a34e3e03 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -14,3 +14,4 @@ math_library(matrix_inverse DEPS dense_tensor eigen3 blas) math_library(pooling DEPS dense_tensor) math_library(segment_pooling) math_library(sequence2batch) +math_library(matrix_solve DEPS dense_tensor eigen3 blas 
math_function) diff --git a/paddle/phi/kernels/funcs/matrix_solve.cc b/paddle/phi/kernels/funcs/matrix_solve.cc new file mode 100644 index 0000000000000..31baedb3c314d --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_solve.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/funcs/matrix_solve.h" + +namespace phi { +namespace funcs { + +template +void MatrixSolveFunctor::operator()(const Context& dev_ctx, + const DenseTensor& a, + const DenseTensor& b, + DenseTensor* out) { + compute_solve_eigen(dev_ctx, a, b, out); +} + +template class MatrixSolveFunctor; +template class MatrixSolveFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_solve.cu b/paddle/phi/kernels/funcs/matrix_solve.cu new file mode 100644 index 0000000000000..fccceb7e20d2d --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_solve.cu @@ -0,0 +1,178 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/funcs/matrix_solve.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace funcs { + +template +void MatrixSolveFunctor::operator()(const Context& context, + const DenseTensor& a, + const DenseTensor& b, + DenseTensor* out) { +#ifndef PADDLE_WITH_HIP + + // solve the equation: Ax = B, + // use cuBlas cublasgetrfBatched funcion to performs the LU + // factorization of each matrix A, + // and then use cuBlas cublasgetriBatched function to solve the + // equation after LU factorization. + // ref: + // https://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-getrfbatched + const auto& a_dims = a.dims(); + const int a_rank = a_dims.size(); + int n = a_dims[a_rank - 1]; + int lda = n; + int batch_size = a_rank > 2 ? a.numel() / (n * n) : 1; + + const auto& b_dims = b.dims(); + const int b_rank = b_dims.size(); + int nrhs = b_dims[b_rank - 1]; + int ldb = b_dims[b_rank - 2]; + + // make sure the out dims is right + out->Resize(b_dims); + + context.template Alloc(out); + + // copy input A to a temporary tensor tmp_a, + // LU factorization, written back to original matrix A, so in the beginning, + // it's necessary to create a temporary tensor tmp_a. 
+ DenseTensor tmp_a(a.dtype()); + tmp_a.Resize(a.dims()); + + context.template Alloc(&tmp_a); + paddle::framework::TensorCopy(a, context.GetPlace(), &tmp_a); + + // copy input B to a temporary tensor tmp_b, and transpose tmp_b, + // because cuBlas assumes column-major while Paddle uses row-majar. + DenseTensor tmp_b(b.type()); + const auto& new_dims_vec = getNewDimsVec(b_dims); + tmp_b.Resize(phi::make_ddim(new_dims_vec)); + context.template Alloc(&tmp_b); + phi::funcs::TransposeNormal trans; + std::vector new_axis = getNewAxis(b_rank); + trans(context, b, &tmp_b, new_axis); + + const T* a_data_in_gpu = tmp_a.data(); + const T* b_data_in_gpu = tmp_b.data(); + + std::vector cpu_ptrs(batch_size * 2); + for (int i = 0; i < batch_size; ++i) { + cpu_ptrs[i] = a_data_in_gpu + i * n * n; + cpu_ptrs[i + batch_size] = b_data_in_gpu + i * n * nrhs; + } + + // Copy the addresses of A and tmp_b from host to device. + paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = + paddle::memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); + paddle::memory::Copy(context.GetPlace(), + tmp_gpu_ptrs_data->ptr(), + phi::CPUPlace(), + static_cast(cpu_ptrs.data()), + cpu_ptrs.size() * sizeof(T*), + context.stream()); + + T** gpu_tmp_b_ptrs = + reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; + + // Allocate device memory for BatchedGETRF's info and pivots. + int num_ints = n < 32 ? batch_size : batch_size * (n + 1); + paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = + paddle::memory::Alloc(context, num_ints * sizeof(int)); + int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); + + auto blas = phi::funcs::GetBlas(context); + + // only for singular checking + std::vector info; + info.resize(batch_size); + + int* gpu_pivot_ptr = + reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; + + // This function performs the LU factorization of each matrix A by the + // equation A = L * U. L and U are written back to original matrix A, + // and diagonal elements of L are discarded. + blas.BatchedGETRF(n, + reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + gpu_pivot_ptr, + gpu_info_ptr, + batch_size); + + // check whether BatchedGETRF is executed successfully or not + paddle::memory::Copy(phi::CPUPlace(), + info.data(), + context.GetPlace(), + gpu_info_ptr, + sizeof(int) * batch_size, + context.stream()); + for (int i = 0; i < batch_size; ++i) { + PADDLE_ENFORCE_EQ(info[i], + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: U(%d, %d) is zero, singular U. " + "Please check the matrix value and change it to a " + "non-singular matrix", + i, + info[i], + info[i])); + } + + // hold the result code from BatchedGETRS + int host_info = 0; + + // to solve the equation after LU factorization + CBLAS_TRANSPOSE transA = CblasTrans; + blas.BatchedGETRS(transA, + n, + nrhs, + reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + lda, + gpu_pivot_ptr, + gpu_tmp_b_ptrs, + ldb, + &host_info, + batch_size); + + // check whether BatchedGETRS is executed successfully or not + PADDLE_ENFORCE_EQ(host_info, + 0, + phi::errors::InvalidArgument( + "The [%d]'th argument to cublas*getrsBatched had " + "an illegal value.", + -host_info)); + + // transpose tmp_b to get the final result in row-major form. 
+ phi::funcs::TransposeNormal trans2; + trans2(context, tmp_b, out, new_axis); + +#else + compute_solve_eigen(context, a, b, out); +#endif +} + +template class MatrixSolveFunctor; +template class MatrixSolveFunctor; + +// TODO(wuweilong): remove these instantiations later +template class MatrixSolveFunctor; +template class MatrixSolveFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/phi/kernels/funcs/matrix_solve.h similarity index 61% rename from paddle/fluid/operators/math/matrix_solve.h rename to paddle/phi/kernels/funcs/matrix_solve.h index 6852d04e5a7e9..3856c06c1b25f 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/phi/kernels/funcs/matrix_solve.h @@ -18,18 +18,79 @@ limitations under the License. */ #include "Eigen/Core" #include "Eigen/LU" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { -namespace math { - -template -void compute_solve_eigen(const DeviceContext& context, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* out) { +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace funcs { + +// for TransposeNormal +static std::vector getNewAxis(const int b_rank) { + std::vector axis_1 = {0}; + std::vector axis_2 = {1, 0}; + std::vector axis_3 = {0, 2, 1}; + std::vector axis_4 = {0, 1, 3, 2}; + std::vector axis_5 = {0, 1, 2, 4, 3}; + std::vector axis_6 = {0, 1, 2, 3, 5, 4}; + std::vector axis_7 = {0, 1, 2, 3, 4, 6, 5}; + std::vector axis_8 = {0, 1, 2, 3, 4, 5, 7, 6}; + std::vector axis_9 = {0, 1, 2, 3, 4, 5, 6, 8, 7}; + switch (b_rank) { + case 1: + return axis_1; + break; + case 2: + return axis_2; + break; + case 3: + return axis_3; + break; + case 4: + return axis_4; + break; + case 5: + return axis_5; + break; + case 6: + return axis_6; + break; + case 7: + return axis_7; + break; + case 8: + return axis_8; + break; + default: + return axis_9; + } +} + +// for Resize +static std::vector getNewDimsVec(const DDim& b_dims) { + std::vector b_dims_vec = phi::vectorize(b_dims); + int size = b_dims_vec.size(); + if (size >= 2) { + // swap the last 2 elements in b_dims_vec + int64_t temp = b_dims_vec[size - 1]; + b_dims_vec[size - 1] = b_dims_vec[size - 2]; + b_dims_vec[size - 2] = temp; + return b_dims_vec; + } + PADDLE_ENFORCE_NE( + b_dims_vec.empty(), + true, + phi::errors::PreconditionNotMet( + "The size of tensor b must not be %d after getting new dims", 0)); + // if b_dims_vec.size() == 1, just retun original vec + return b_dims_vec; +} + +template +void compute_solve_eigen(const Context& context, + const DenseTensor& a, + const DenseTensor& b, + DenseTensor* out) { using Matrix = Eigen::Matrix; using EigenMatrixMap = Eigen::Map; @@ -51,7 +112,7 @@ void compute_solve_eigen(const DeviceContext& context, const T* b_ptr = b.data(); out->Resize(b_mat_dims); // make sure the out dims is right - T* out_ptr = out->mutable_data(context.GetPlace()); + T* out_ptr = context.template Alloc(out); if (a_batch_size == b_batch_size) { for (int i = 0; i < a_batch_size; ++i) { ConstEigenMatrixMap a_mat(a_ptr + i * n * n, n, n); @@ -63,13 +124,13 @@ void compute_solve_eigen(const DeviceContext& context, PADDLE_ENFORCE_GT( min_abs_pivot, static_cast(0), - platform::errors::InvalidArgument("Input is not invertible.")); + phi::errors::InvalidArgument("Input is not invertible.")); 
out_mat.noalias() = lu.solve(b_mat); } } else { PADDLE_ENFORCE_EQ(a_batch_size, b_batch_size, - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "All input tensors must have the same rank.")); } } @@ -114,22 +175,21 @@ void SolveLinearSystem(T* matrix_data, lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff(); PADDLE_ENFORCE_GT(min_abs_piv, Treal(0), - platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Something's wrong with SolveLinearSystem. ")); output = lu_decomposition.solve(input_rhs); } } -template +template class MatrixSolveFunctor { public: - void operator()(const DeviceContext& context, - const framework::Tensor& a, - const framework::Tensor& b, - framework::Tensor* out); + void operator()(const Context& context, + const DenseTensor& a, + const DenseTensor& b, + DenseTensor* out); }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi From 0a5d625b24e5ab5021f6f75480c376e752a816d7 Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 13 Jul 2022 19:04:19 +0800 Subject: [PATCH 185/250] Opt sparse mask_kernel (#44302) * opt sparse_mask --- .../{sparse_mask_kernel.cc => mask_kernel.cc} | 2 +- .../{sparse_mask_kernel.cu => mask_kernel.cu} | 143 ++++++++++-------- .../{sparse_mask_kernel.h => mask_kernel.h} | 0 .../sparse/sparse_utils_grad_kernel.cc | 1 - .../kernels/sparse/sparse_utils_grad_kernel.h | 2 +- 5 files changed, 84 insertions(+), 64 deletions(-) rename paddle/phi/kernels/sparse/cpu/{sparse_mask_kernel.cc => mask_kernel.cc} (99%) rename paddle/phi/kernels/sparse/gpu/{sparse_mask_kernel.cu => mask_kernel.cu} (72%) rename paddle/phi/kernels/sparse/{sparse_mask_kernel.h => mask_kernel.h} (100%) diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc similarity index 99% rename from paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc rename to paddle/phi/kernels/sparse/cpu/mask_kernel.cc index cf2acd8557333..92c015101264c 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/mask_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" +#include "paddle/phi/kernels/sparse/mask_kernel.h" #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu similarity index 72% rename from paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu rename to paddle/phi/kernels/sparse/gpu/mask_kernel.cu index 21d6850bdc4aa..39fa89c0379b7 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/mask_kernel.cu @@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" - -#include +#include "paddle/phi/kernels/sparse/mask_kernel.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" @@ -24,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" @@ -72,11 +71,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(sparse_offsets.data(), &h_sparse_offsets[0], sizeof(int64_t) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); @@ -93,14 +88,15 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num * cols, 1); - MaskKernel<<>>( - x_ptr, - indices_ptr, - sparse_offsets.data(), - non_zero_num, - cols, - sparse_dim, - out_values_ptr); + MaskKernel + <<>>( + x_ptr, + indices_ptr, + sparse_offsets.data(), + non_zero_num, + cols, + sparse_dim, + out_values_ptr); out->SetMember(out_indices, out_values, dims, true); } @@ -121,19 +117,31 @@ void SparseMaskKernel(const Context& dev_ctx, })); } -template -__global__ void SparseMaskCopyKernel(const IntT* x_indexs, - const IntT* mask_indexs, - const IntT* bound_out, - const T* x_values, - const int64_t n, - const int64_t stride, - T* out_values) { +template +__global__ void MaskTable(const IntT* x_indexs, const int n, int* table) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + int index = x_indexs[i]; + table[index] = i == 0 ? -1 : i; + } +} + +template +__global__ void MaskCopy(const IntT* mask_indexs, + const int* table, + const int n, + const int stride, + const T* x_values, + T* out_values) { + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { - const IntT j = bound_out[i]; - if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { - for (int k = 0; k < stride; k++) { - out_values[i * stride + k] = x_values[j * stride + k]; + int j = table[mask_indexs[i]]; + if (j != 0) { + if (j == -1) j = 0; + for (int k = 0; k < stride; k += VecSize) { + LoadT vec_x; + phi::Load(x_values + j * stride + k, &vec_x); + phi::Store(vec_x, out_values + i * stride + k); } } } @@ -179,11 +187,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), sizeof(IntT) * sparse_dim, -#ifdef PADDLE_WITH_HIP - hipMemcpyHostToDevice, -#else - cudaMemcpyHostToDevice, -#endif + gpuMemcpyHostToDevice, dev_ctx.stream()); // 3. flatten x indices and mask indices @@ -210,37 +214,54 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, mask_indexs.numel(), sparse_dim, mask_indexs_ptr); -// 4. call thrust::lower_bound -#ifdef PADDLE_WITH_HIP - thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), -#else - thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), -#endif - x_indexs_ptr, - x_indexs_ptr + x_indexs.numel(), - mask_indexs_ptr, - mask_indexs_ptr + mask_indexs.numel(), - bound_out_ptr); - // 5. copy value to out + int table_size = 1; + auto x_dims = x.dims(); + for (int i = 0; i < x_dims.size() - 1; i++) { + table_size *= x_dims[i]; + } + DenseTensor table = phi::Empty(dev_ctx, {table_size}); + phi::backends::gpu::GpuMemsetAsync( + table.data(), 0, table_size * sizeof(int), dev_ctx.stream()); + const int64_t stride = + x.dims().size() == sparse_dim ? 
1 : x.non_zero_elements().dims()[1]; *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); T* out_ptr = out->data(); - - const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; - - SparseMaskCopyKernel<<>>(x_indexs_ptr, - mask_indexs_ptr, - bound_out_ptr, - x.non_zero_elements().data(), - mask_indexs.numel(), - stride, - out_ptr); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + MaskTable<<>>( + x_indexs_ptr, x_indexs.numel(), table.data()); + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); + const int VecBytes = 16; + const int VecSize = VecBytes / sizeof(T); + if (stride % VecSize == 0) { + MaskCopy + <<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); + } else { + MaskCopy<<>>(mask_indexs_ptr, + table.data(), + mask_indexs.numel(), + stride, + x.non_zero_elements().data(), + out_ptr); + } } template @@ -257,7 +278,7 @@ void SparseMaskHelperKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_mask, +PD_REGISTER_KERNEL(mask, GPU, ALL_LAYOUT, phi::sparse::SparseMaskKernel, @@ -272,7 +293,7 @@ PD_REGISTER_KERNEL(sparse_mask, kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } -PD_REGISTER_KERNEL(sparse_mask_helper, +PD_REGISTER_KERNEL(mask_helper, GPU, ALL_LAYOUT, phi::sparse::SparseMaskHelperKernel, diff --git a/paddle/phi/kernels/sparse/sparse_mask_kernel.h b/paddle/phi/kernels/sparse/mask_kernel.h similarity index 100% rename from paddle/phi/kernels/sparse/sparse_mask_kernel.h rename to paddle/phi/kernels/sparse/mask_kernel.h diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 69677be34b231..9425c14b79b36 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h index a00b9c275c292..7cf97c3f48ece 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" +#include "paddle/phi/kernels/sparse/mask_kernel.h" namespace phi { namespace sparse { From 95474815f976c4688393dacfe545207140eb6560 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Wed, 13 Jul 2022 19:13:22 +0800 Subject: [PATCH 186/250] Move eigvals OP to PHI (#44183) * Move eigvals OP to PHI * Fix CI errors * Fix CI errors --- paddle/fluid/operators/eigvals_op.cc | 57 +---- paddle/fluid/operators/eigvals_op.h | 273 ----------------------- paddle/phi/api/yaml/legacy_api.yaml | 8 + paddle/phi/core/utils/data_type.h | 17 ++ paddle/phi/infermeta/unary.cc | 33 +++ paddle/phi/infermeta/unary.h | 4 + paddle/phi/kernels/cpu/eigvals_kernel.cc | 260 +++++++++++++++++++++ paddle/phi/kernels/eigvals_kernel.h | 25 +++ paddle/phi/ops/compat/eigvals_sig.cc | 25 +++ python/paddle/tensor/linalg.py | 4 +- 10 files changed, 383 insertions(+), 323 deletions(-) delete mode 100644 paddle/fluid/operators/eigvals_op.h create mode 100644 paddle/phi/kernels/cpu/eigvals_kernel.cc create mode 100644 paddle/phi/kernels/eigvals_kernel.h create mode 100644 paddle/phi/ops/compat/eigvals_sig.cc diff --git a/paddle/fluid/operators/eigvals_op.cc b/paddle/fluid/operators/eigvals_op.cc index cb81a1a64d1d5..78bd2b37f6959 100644 --- a/paddle/fluid/operators/eigvals_op.cc +++ b/paddle/fluid/operators/eigvals_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eigvals_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -36,59 +37,17 @@ class EigvalsOpMaker : public framework::OpProtoAndCheckerMaker { class EigvalsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigvals"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Eigvals"); - - DDim x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), - 2, - platform::errors::InvalidArgument( - "The dimensions of Input(X) for Eigvals operator " - "should be at least 2, " - "but received X's dimension = %d, X's shape = [%s].", - x_dims.size(), - x_dims)); - - if (ctx->IsRuntime() || !phi::contain_unknown_dim(x_dims)) { - int last_dim = x_dims.size() - 1; - PADDLE_ENFORCE_EQ(x_dims[last_dim], - x_dims[last_dim - 1], - platform::errors::InvalidArgument( - "The last two dimensions of Input(X) for Eigvals " - "operator should be equal, " - "but received X's shape = [%s].", - x_dims)); - } - - auto output_dims = vectorize(x_dims); - output_dims.resize(x_dims.size() - 1); - ctx->SetOutputDim("Out", phi::make_ddim(output_dims)); - } }; -class EigvalsOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const { - auto input_dtype = ctx->GetInputDataType("X"); - auto output_dtype = framework::IsComplexType(input_dtype) - ? 
input_dtype - : framework::ToComplexType(input_dtype); - ctx->SetOutputDataType("Out", output_dtype); - } -}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -namespace plat = paddle::platform; + +DECLARE_INFER_SHAPE_FUNCTOR(eigvals, + EigvalsInferShapeFunctor, + PD_INFER_META(phi::EigvalsInferMeta)); REGISTER_OPERATOR(eigvals, ops::EigvalsOp, ops::EigvalsOpMaker, - ops::EigvalsOpVarTypeInference); -REGISTER_OP_CPU_KERNEL( - eigvals, - ops::EigvalsKernel, - ops::EigvalsKernel, - ops::EigvalsKernel>, - ops::EigvalsKernel>); + EigvalsInferShapeFunctor); diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h deleted file mode 100644 index 38560bf7c35bd..0000000000000 --- a/paddle/fluid/operators/eigvals_op.h +++ /dev/null @@ -1,273 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template -struct PaddleComplex; - -template -struct PaddleComplex< - T, - typename std::enable_if::value>::type> { - using type = paddle::platform::complex; -}; -template -struct PaddleComplex< - T, - typename std::enable_if< - std::is_same>::value || - std::is_same>::value>::type> { - using type = T; -}; - -template -using PaddleCType = typename PaddleComplex::type; -template -using Real = typename phi::dtype::Real; - -static void SpiltBatchSquareMatrix(const Tensor& input, - std::vector* output) { - DDim input_dims = input.dims(); - int last_dim = input_dims.size() - 1; - int n_dim = input_dims[last_dim]; - - DDim flattened_input_dims, flattened_output_dims; - if (input_dims.size() > 2) { - flattened_input_dims = - phi::flatten_to_3d(input_dims, last_dim - 1, last_dim); - } else { - flattened_input_dims = phi::make_ddim({1, n_dim, n_dim}); - } - - Tensor flattened_input; - flattened_input.ShareDataWith(input); - flattened_input.Resize(flattened_input_dims); - (*output) = flattened_input.Split(1, 0); -} - -static void CheckLapackEigResult(const int info, const std::string& name) { - PADDLE_ENFORCE_LE(info, - 0, - platform::errors::PreconditionNotMet( - "The QR algorithm failed to compute all the " - "eigenvalues in function %s.", - name.c_str())); - PADDLE_ENFORCE_GE( - info, - 0, - platform::errors::InvalidArgument( - "The %d-th argument has an illegal value in function %s.", - -info, - name.c_str())); -} - -template -static typename std::enable_if::value>::type -LapackEigvals(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output, - Tensor* work, - 
Tensor* rwork /*unused*/) { - Tensor a; // will be overwritten when lapackEig exit - framework::TensorCopy(input, input.place(), &a); - - Tensor w; - int64_t n_dim = input.dims()[1]; - auto* w_data = - w.mutable_data(phi::make_ddim({n_dim << 1}), ctx.GetPlace()); - - int64_t work_mem = work->memory_size(); - int64_t required_work_mem = 3 * n_dim * sizeof(T); - PADDLE_ENFORCE_GE( - work_mem, - 3 * n_dim * sizeof(T), - platform::errors::InvalidArgument( - "The memory size of the work tensor in LapackEigvals function " - "should be at least %" PRId64 " bytes, " - "but received work\'s memory size = %" PRId64 " bytes.", - required_work_mem, - work_mem)); - - int info = 0; - phi::funcs::lapackEig('N', - 'N', - static_cast(n_dim), - a.template data(), - static_cast(n_dim), - w_data, - NULL, - 1, - NULL, - 1, - work->template data(), - static_cast(work_mem / sizeof(T)), - static_cast(NULL), - &info); - - std::string name = "framework::platform::dynload::dgeev_"; - if (framework::TransToProtoVarType(input.dtype()) == - framework::proto::VarType::FP64) { - name = "framework::platform::dynload::sgeev_"; - } - CheckLapackEigResult(info, name); - - platform::ForRange for_range( - ctx.template device_context(), n_dim); - phi::funcs::RealImagToComplexFunctor> functor( - w_data, w_data + n_dim, output->template data>(), n_dim); - for_range(functor); -} - -template -typename std::enable_if>::value || - std::is_same>::value>::type -LapackEigvals(const framework::ExecutionContext& ctx, - const Tensor& input, - Tensor* output, - Tensor* work, - Tensor* rwork) { - Tensor a; // will be overwritten when lapackEig exit - framework::TensorCopy(input, input.place(), &a); - - int64_t work_mem = work->memory_size(); - int64_t n_dim = input.dims()[1]; - int64_t required_work_mem = 3 * n_dim * sizeof(T); - PADDLE_ENFORCE_GE( - work_mem, - 3 * n_dim * sizeof(T), - platform::errors::InvalidArgument( - "The memory size of the work tensor in LapackEigvals function " - "should be at least %" PRId64 " bytes, " - "but received work\'s memory size = %" PRId64 " bytes.", - required_work_mem, - work_mem)); - - int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::dtype::Real); - PADDLE_ENFORCE_GE( - rwork_mem, - required_rwork_mem, - platform::errors::InvalidArgument( - "The memory size of the rwork tensor in LapackEigvals function " - "should be at least %" PRId64 " bytes, " - "but received rwork\'s memory size = %" PRId64 " bytes.", - required_rwork_mem, - rwork_mem)); - - int info = 0; - phi::funcs::lapackEig>( - 'N', - 'N', - static_cast(n_dim), - a.template data(), - static_cast(n_dim), - output->template data(), - NULL, - 1, - NULL, - 1, - work->template data(), - static_cast(work_mem / sizeof(T)), - rwork->template data>(), - &info); - - std::string name = "framework::platform::dynload::cgeev_"; - if (framework::TransToProtoVarType(input.dtype()) == - framework::proto::VarType::COMPLEX64) { - name = "framework::platform::dynload::zgeev_"; - } - CheckLapackEigResult(info, name); -} - -template -class EigvalsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("X"); - Tensor* output = ctx.Output("Out"); - output->mutable_data>(ctx.GetPlace()); - - std::vector input_matrices; - SpiltBatchSquareMatrix(*input, /*->*/ &input_matrices); - - int64_t n_dim = input_matrices[0].dims()[1]; - int64_t n_batch = input_matrices.size(); - DDim output_dims = output->dims(); - 
output->Resize(phi::make_ddim({n_batch, n_dim})); - std::vector output_vectors = output->Split(1, 0); - - // query workspace size - T qwork; - int info; - phi::funcs::lapackEig>( - 'N', - 'N', - static_cast(n_dim), - input_matrices[0].template data(), - static_cast(n_dim), - NULL, - NULL, - 1, - NULL, - 1, - &qwork, - -1, - static_cast*>(NULL), - &info); - int64_t lwork = static_cast(qwork); - - Tensor work, rwork; - try { - work.mutable_data(phi::make_ddim({lwork}), ctx.GetPlace()); - } catch (memory::allocation::BadAlloc&) { - LOG(WARNING) << "Failed to allocate Lapack workspace with the optimal " - << "memory size = " << lwork * sizeof(T) << " bytes, " - << "try reallocating a smaller workspace with the minimum " - << "required size = " << 3 * n_dim * sizeof(T) << " bytes, " - << "this may lead to bad performance."; - lwork = 3 * n_dim; - work.mutable_data(phi::make_ddim({lwork}), ctx.GetPlace()); - } - if (framework::IsComplexType( - framework::TransToProtoVarType(input->dtype()))) { - rwork.mutable_data>(phi::make_ddim({n_dim << 1}), - ctx.GetPlace()); - } - - for (int64_t i = 0; i < n_batch; ++i) { - LapackEigvals( - ctx, input_matrices[i], &output_vectors[i], &work, &rwork); - } - output->Resize(output_dims); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index cd01c23641010..3dad0b96ae758 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -536,6 +536,14 @@ func : eigh backward : eigh_grad +- api : eigvals + args : (Tensor x) + output : Tensor + infer_meta : + func : EigvalsInferMeta + kernel : + func : eigvals + - api : einsum args : (Tensor[] x, str equation) output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()} diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 9ef8e8a356c7a..975d55889c717 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -80,4 +80,21 @@ inline void VisitDataTypeTiny(phi::DataType type, Visitor visitor) { "Not supported phi::DataType(%d) as data type.", static_cast(type))); } +inline bool IsComplexType(const DataType& type) { + return (type == DataType::COMPLEX64 || type == DataType::COMPLEX128); +} + +inline DataType ToComplexType(const DataType& type) { + switch (type) { + case DataType::FLOAT32: + return DataType::COMPLEX64; + case DataType::FLOAT64: + return DataType::COMPLEX128; + default: + PADDLE_THROW(errors::Unimplemented( + "Can not transform data type (%s) to complex type, now only support " + "float32 and float64 real value.", + type)); + } +} } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 0048f130adf62..f6e3b0d72474a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -399,6 +399,39 @@ void EighInferMeta(const MetaTensor& x, out_v->set_dims(input_dim); } +void EigvalsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config) { + auto x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + errors::InvalidArgument( + "The dimensions of Input(X) for Eigvals operator " + "should be at least 2, " + "but received X's dimension = %d, X's shape = [%s].", + x_dims.size(), + x_dims)); + + if (config.is_runtime || !phi::contain_unknown_dim(x_dims)) { + int last_dim = x_dims.size() - 1; + PADDLE_ENFORCE_EQ(x_dims[last_dim], + x_dims[last_dim - 1], + errors::InvalidArgument( + "The last two dimensions of Input(X) for Eigvals " + "operator should be 
equal, " + "but received X's shape = [%s].", + x_dims)); + } + + auto out_dims = vectorize(x_dims); + out_dims.resize(x_dims.size() - 1); + + const DataType& x_dtype = x.dtype(); + const DataType& out_dtype = + IsComplexType(x_dtype) ? x_dtype : ToComplexType(x_dtype); + + out->set_dims(make_ddim(out_dims)); + out->set_dtype(out_dtype); +} + void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 0b9298cfd362f..fc36e1d4f85b6 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -80,6 +80,10 @@ void EighInferMeta(const MetaTensor& x, MetaTensor* out_w, MetaTensor* out_v); +void EigvalsInferMeta(const MetaTensor& x, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void EinsumInferMeta(const std::vector& inputs, const std::string& equation, MetaTensor* out, diff --git a/paddle/phi/kernels/cpu/eigvals_kernel.cc b/paddle/phi/kernels/cpu/eigvals_kernel.cc new file mode 100644 index 0000000000000..e99aa42fbdb29 --- /dev/null +++ b/paddle/phi/kernels/cpu/eigvals_kernel.cc @@ -0,0 +1,260 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/eigvals_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" + +namespace phi { + +template +struct PaddleComplex; + +template +struct PaddleComplex< + T, + typename std::enable_if::value>::type> { + using type = dtype::complex; +}; + +template +struct PaddleComplex< + T, + typename std::enable_if< + std::is_same>::value || + std::is_same>::value>::type> { + using type = T; +}; + +template +using PaddleCType = typename PaddleComplex::type; +template +using Real = typename dtype::Real; + +inline void CheckLapackEigResult(const int info, const std::string& name) { + PADDLE_ENFORCE_LE( + info, + 0, + errors::PreconditionNotMet("The QR algorithm failed to compute all the " + "eigenvalues in function %s.", + name.c_str())); + PADDLE_ENFORCE_GE( + info, + 0, + errors::InvalidArgument( + "The %d-th argument has an illegal value in function %s.", + -info, + name.c_str())); +} + +template +typename std::enable_if::value>::type LapackEigvals( + const Context& ctx, + const DenseTensor& input, + DenseTensor* output, + DenseTensor* work, + DenseTensor* rwork /*unused*/) { + DenseTensor a; // will be overwritten when lapackEig exit + Copy(ctx, input, input.place(), /*blocking=*/true, &a); + + DenseTensor w; + int64_t n_dim = input.dims()[1]; + w.Resize(make_ddim({n_dim << 1})); + T* w_data = ctx.template Alloc(&w); + + int64_t work_mem = work->memory_size(); + int64_t required_work_mem = 3 * n_dim * sizeof(T); + PADDLE_ENFORCE_GE( + work_mem, + 3 * n_dim * sizeof(T), + errors::InvalidArgument( + "The memory size of the work tensor in LapackEigvals function " + "should be at least %" PRId64 " bytes, " + "but received work\'s memory size = %" PRId64 " bytes.", + required_work_mem, + work_mem)); + + int info = 0; + phi::funcs::lapackEig('N', + 'N', + static_cast(n_dim), + a.template data(), + static_cast(n_dim), + w_data, + NULL, + 1, + NULL, + 1, + work->template data(), + static_cast(work_mem / sizeof(T)), + static_cast(NULL), + &info); + + std::string name = "phi::backend::dynload::dgeev_"; + if (input.dtype() == DataType::FLOAT64) { + name = "phi::backend::dynload::sgeev_"; + } + CheckLapackEigResult(info, name); + + funcs::ForRange for_range(ctx, n_dim); + funcs::RealImagToComplexFunctor> functor( + w_data, w_data + n_dim, output->template data>(), n_dim); + for_range(functor); +} + +template +typename std::enable_if>::value || + std::is_same>::value>::type +LapackEigvals(const Context& ctx, + const DenseTensor& input, + DenseTensor* output, + DenseTensor* work, + DenseTensor* rwork) { + DenseTensor a; // will be overwritten when lapackEig exit + Copy(ctx, input, input.place(), /*blocking=*/true, &a); + + int64_t work_mem = work->memory_size(); + int64_t n_dim = input.dims()[1]; + int64_t required_work_mem = 3 * n_dim * sizeof(T); + PADDLE_ENFORCE_GE( + work_mem, + 3 * n_dim * sizeof(T), + errors::InvalidArgument( + "The memory size of the work tensor in LapackEigvals function " + "should be at least %" PRId64 " bytes, " + "but received work\'s memory size = %" PRId64 " bytes.", + required_work_mem, + work_mem)); + + int64_t rwork_mem = rwork->memory_size(); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(dtype::Real); + PADDLE_ENFORCE_GE( + rwork_mem, + 
required_rwork_mem, + errors::InvalidArgument( + "The memory size of the rwork tensor in LapackEigvals function " + "should be at least %" PRId64 " bytes, " + "but received rwork\'s memory size = %" PRId64 " bytes.", + required_rwork_mem, + rwork_mem)); + + int info = 0; + phi::funcs::lapackEig>( + 'N', + 'N', + static_cast(n_dim), + a.template data(), + static_cast(n_dim), + output->template data(), + NULL, + 1, + NULL, + 1, + work->template data(), + static_cast(work_mem / sizeof(T)), + rwork->template data>(), + &info); + + std::string name = "phi::backend::dynload::cgeev_"; + if (input.dtype() == DataType::COMPLEX128) { + name = "phi::backend::dynload::zgeev_"; + } + CheckLapackEigResult(info, name); +} + +void SpiltBatchSquareMatrix(const DenseTensor& input, + std::vector* output) { + DDim input_dims = input.dims(); + int last_dim = input_dims.size() - 1; + int n_dim = input_dims[last_dim]; + + DDim flattened_input_dims, flattened_output_dims; + if (input_dims.size() > 2) { + flattened_input_dims = + phi::flatten_to_3d(input_dims, last_dim - 1, last_dim); + } else { + flattened_input_dims = phi::make_ddim({1, n_dim, n_dim}); + } + + DenseTensor flattened_input; + flattened_input.ShareDataWith(input); + flattened_input.Resize(flattened_input_dims); + (*output) = flattened_input.Split(1, 0); +} + +template +void EigvalsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc>(out); + + std::vector x_matrices; + SpiltBatchSquareMatrix(x, /*->*/ &x_matrices); + + int64_t n_dim = x_matrices[0].dims()[1]; + int64_t n_batch = x_matrices.size(); + DDim out_dims = out->dims(); + out->Resize(make_ddim({n_batch, n_dim})); + std::vector out_vectors = out->Split(1, 0); + + // query workspace size + T qwork; + int info; + funcs::lapackEig>('N', + 'N', + static_cast(n_dim), + x_matrices[0].template data(), + static_cast(n_dim), + NULL, + NULL, + 1, + NULL, + 1, + &qwork, + -1, + static_cast*>(NULL), + &info); + int64_t lwork = static_cast(qwork); + + DenseTensor work, rwork; + + work.Resize(make_ddim({lwork})); + ctx.template Alloc(&work); + + if (IsComplexType(x.dtype())) { + rwork.Resize(make_ddim({n_dim << 1})); + ctx.template Alloc>(&rwork); + } + + for (int64_t i = 0; i < n_batch; ++i) { + LapackEigvals( + ctx, x_matrices[i], &out_vectors[i], &work, &rwork); + } + out->Resize(out_dims); +} + +} // namespace phi + +PD_REGISTER_KERNEL(eigvals, + CPU, + ALL_LAYOUT, + phi::EigvalsKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/eigvals_kernel.h b/paddle/phi/kernels/eigvals_kernel.h new file mode 100644 index 0000000000000..dd9f3370bd08e --- /dev/null +++ b/paddle/phi/kernels/eigvals_kernel.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void EigvalsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/eigvals_sig.cc b/paddle/phi/ops/compat/eigvals_sig.cc new file mode 100644 index 0000000000000..cb29126abc39f --- /dev/null +++ b/paddle/phi/ops/compat/eigvals_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature EigvalsOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("eigvals", {"X"}, {}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(eigvals, phi::EigvalsOpArgumentMapping); diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 95eaee2cc0356..1bc85a076a0f7 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2339,7 +2339,9 @@ def eigvals(x, name=None): "The last two dimensions of Input(x) should be equal, but received x's shape = {}" .format(x_shape)) - if paddle.in_dynamic_mode(): + if in_dygraph_mode(): + return _C_ops.final_state_eigvals(x) + elif paddle.in_dynamic_mode(): return _C_ops.eigvals(x) helper = LayerHelper('eigvals', **locals()) From 5a312fb9f6c51e83caa4711b957bf1ad00aa3099 Mon Sep 17 00:00:00 2001 From: moyan <79295425+momozi1996@users.noreply.github.com> Date: Wed, 13 Jul 2022 19:20:34 +0800 Subject: [PATCH 187/250] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E8=B4=A6=E6=88=B7?= =?UTF-8?q?=E5=8F=8Aid=20MoYan=20=E8=87=B3=20api=20approval=20=E6=A3=80?= =?UTF-8?q?=E6=9F=A5=E5=88=97=E8=A1=A8=20(#44306)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * MoYan;test=document_fix * MoYan;test=document_fix --- tools/check_api_approvals.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 18b467ccf4781..815201469e89a 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -43,22 +43,22 @@ api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/flu if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then echo_line="You must have one RD (XiaoguangHu01, lanxianghit or Superjomn) approval for API change.\n" echo_line="${echo_line} and one TPM approval for API change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general APIs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general APIs.\n" echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related APIs.\n" echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n" check_approval 1 46782768 47554610 328693 - check_approval 1 29231 
23093488 11935832 39876205 65896652 54695910 + check_approval 1 29231 79295425 23093488 11935832 39876205 65896652 54695910 fi api_doc_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.doc ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.doc` if [ "$api_doc_spec_diff" != "" ]; then echo_line="You must have one TPM approval for API documents change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general API docs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general API docs.\n" echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related API docs.\n" echo_line="${echo_line} leiqing1/LeiQing for inference related API docs.\n" - check_approval 1 29231 23093488 11935832 39876205 65896652 54695910 + check_approval 1 29231 79295425 23093488 11935832 39876205 65896652 54695910 fi api_src_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5 ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` From cb4eea9277a1abfcb95df350b91ad4aa18578c1a Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 13 Jul 2022 19:20:58 +0800 Subject: [PATCH 188/250] fix convert error. (#44307) --- .../passes/convert_to_mixed_precision.cc | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index bc753636d2c1a..48d2cefe4a720 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -119,6 +119,15 @@ bool WeightsShouldNotConvert(ir::Node* var_node) { return false; } +inline bool IsFloatVarType(framework::proto::VarType::Type type) { + if (type == framework::proto::VarType::FP16 || + type == framework::proto::VarType::FP32 || + type == framework::proto::VarType::BF16 || + type == framework::proto::VarType::FP64) + return true; + return false; +} + void ConvertTensorDtype(framework::ir::Graph* graph, const std::unordered_set& blacklist, bool keep_io_types, @@ -146,8 +155,6 @@ void ConvertTensorDtype(framework::ir::Graph* graph, if (!op_node->IsOp()) continue; auto op_type = op_node->Op()->Type(); auto phi_op_type = phi::TransToPhiKernelName(op_type); - // LOG(INFO) << "process op " << op_type << ", corresponding phi type is " - // << phi_op_type; // 1. set input dtype. 
if (op_type == "feed") { block_desc = op_node->Op()->Block(); @@ -175,12 +182,14 @@ void ConvertTensorDtype(framework::ir::Graph* graph, ++num_low_precision; auto inputs = op_node->inputs; for (auto* in_node : inputs) { + if (in_node->IsCtrlVar()) continue; auto* in_var = in_node->Var(); if (in_var->Persistable() && in_var->GetDataType() == framework::proto::VarType::FP32) { if (WeightsShouldNotConvert(in_node)) continue; in_var->SetDataType(to_type); } else if (!in_var->Persistable() && + IsFloatVarType(in_var->GetDataType()) && in_var->GetDataType() != to_type) { AddCastOp(graph, in_node, @@ -193,6 +202,7 @@ void ConvertTensorDtype(framework::ir::Graph* graph, } } for (auto* out_node : op_node->outputs) { + if (out_node->IsCtrlVar()) continue; auto* out_var = out_node->Var(); if (out_var->GetDataType() == framework::proto::VarType::FP32) { if (OutShouldNotConvert(out_node)) continue; @@ -202,8 +212,9 @@ void ConvertTensorDtype(framework::ir::Graph* graph, } else { auto inputs = op_node->inputs; for (auto* in_node : inputs) { + if (in_node->IsCtrlVar()) continue; auto* in_var = in_node->Var(); - if (!in_var->Persistable() && + if (!in_var->Persistable() && IsFloatVarType(in_var->GetDataType()) && in_var->GetDataType() != framework::proto::VarType::FP32) { AddCastOp(graph, in_node, @@ -224,6 +235,7 @@ void ConvertTensorDtype(framework::ir::Graph* graph, // trt pass should explicitle add cast op is input is bf16/tf32, etc. if (op_node->Name() == "tensorrt_engine") continue; for (auto* in_node : op_node->inputs) { + if (in_node->IsCtrlVar()) continue; auto* in_var = in_node->Var(); if (in_var->GetDataType() == to_type) { AddCastOp(graph, @@ -242,6 +254,7 @@ void ConvertTensorDtype(framework::ir::Graph* graph, // 4. if output_op's dtype is not compatible to output dtype, then just insert // cast. for (auto* node : output_nodes) { + if (node->IsCtrlVar()) continue; auto var = node->Var(); if (keep_io_types && var->GetDataType() == to_type) { // fp16/bf16 -> fp32. 
@@ -381,7 +394,7 @@ void ConvertToMixedPrecision(const std::string& model_file, std::unordered_set weights_should_be_fp32; for (auto* node : graph->Nodes()) { - if (!node->IsVar()) continue; + if (!(node->IsVar() && !node->IsCtrlVar())) continue; if (node->Var()->GetType() == paddle::framework::proto::VarType::SELECTED_ROWS || node->Var()->GetType() == From 917235be3cdb1893c077eb90482187d8cdaad7ec Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Wed, 13 Jul 2022 19:44:32 +0800 Subject: [PATCH 189/250] add ResNetBasicBlock python api for kunlun, test=kunlun (#44171) --- .../fused/resnet_basic_block_op_xpu.cc | 12 +- .../fluid/platform/device/xpu/xpu2_op_list.h | 6 +- .../test_fused_resnet_basic_block_op_xpu.py | 272 ++++++++++ python/paddle/incubate/__init__.py | 1 + python/paddle/incubate/xpu/__init__.py | 15 + python/paddle/incubate/xpu/resnet_block.py | 468 ++++++++++++++++++ python/setup.py.in | 1 + 7 files changed, 763 insertions(+), 12 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py create mode 100644 python/paddle/incubate/xpu/__init__.py create mode 100644 python/paddle/incubate/xpu/resnet_block.py diff --git a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc index c7a6620c75f8e..52e6807f15c67 100644 --- a/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc +++ b/paddle/fluid/operators/fused/resnet_basic_block_op_xpu.cc @@ -959,12 +959,8 @@ class ResNetBasicBlockGradXPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL( - resnet_basic_block, - ops::ResNetBasicBlockXPUKernel, - ops::ResNetBasicBlockXPUKernel); -REGISTER_OP_XPU_KERNEL( - resnet_basic_block_grad, - ops::ResNetBasicBlockGradXPUKernel, - ops::ResNetBasicBlockGradXPUKernel); +REGISTER_OP_XPU_KERNEL(resnet_basic_block, + ops::ResNetBasicBlockXPUKernel); +REGISTER_OP_XPU_KERNEL(resnet_basic_block_grad, + ops::ResNetBasicBlockGradXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index e7570de695f28..4a6f07b76ba57 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -519,11 +519,9 @@ XPUOpMap& get_kl2_ops() { // Fused op {"resnet_basic_block_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"resnet_basic_block", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), - pOpKernelType(vartype::FP16, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu2_kernels; diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py new file mode 100644 index 0000000000000..8fe9769d51925 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py @@ -0,0 +1,272 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys + +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.nn as nn +from paddle.fluid import core +from paddle.incubate.xpu.resnet_block import ResNetBasicBlock +from paddle.fluid.framework import default_main_program +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestResNetBasicBlockOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = "resnet_basic_block" + self.use_dynamic_create_class = False + + class TestResNetBasicBlockOp(OpTest): + + def setUp(self): + paddle.disable_static() + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.__class__.op_type = "resnet_basic_block" + self.__class__.no_need_check_grad = True + self.getShape() + self.getDiff() + self.getShortcut() + paddle.set_default_dtype(self.dtype) + + self.src = np.random.random(self.input_size).astype(self.dtype) + self.dout = np.random.random(self.output_size).astype(self.dtype) + + def getShape(self): + self.in_channels = 8 + self.out_channels = 8 + self.stride = 1 + self.input_size = [2, 8, 32, 32] # NCHW + self.output_size = [2, 8, 32, 32] # NCHW + + def getDiff(self): + self.rtol = 1e-3 + self.atol = 1e-3 + + def getShortcut(self): + self.has_shortcut = False + + def Base(self): + paddle.disable_static() + + conv1_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv2_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv3_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + bn1_weight = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=1.0)) + bn1_bias = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.0)) + bn2_weight = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=1.0)) + bn2_bias = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.0)) + bn3_weight = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=1.0)) + bn3_bias = fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.0)) + + self.conv1 = nn.Conv2D(in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=self.stride, + padding=1, + weight_attr=conv1_weight, + bias_attr=None, + data_format='NCHW') + self.bn1 = nn.BatchNorm(self.out_channels, + act='relu', + param_attr=bn1_weight, + bias_attr=bn1_bias, + data_layout='NCHW') + self.conv2 = nn.Conv2D(in_channels=self.out_channels, + out_channels=self.out_channels, + kernel_size=3, + stride=1, + padding=1, + weight_attr=conv2_weight, + bias_attr=None, + data_format='NCHW') + self.bn2 = nn.BatchNorm(self.out_channels, + act=None, + param_attr=bn2_weight, + bias_attr=bn2_bias, + data_layout='NCHW') + self.conv3 = nn.Conv2D(in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=1, + stride=self.stride, + padding=0, + weight_attr=conv3_weight, + 
bias_attr=None, + data_format='NCHW') + self.bn3 = nn.BatchNorm(self.out_channels, + act=None, + param_attr=bn3_weight, + bias_attr=bn3_bias, + data_layout='NCHW') + self.relu = nn.ReLU() + + tensor_src = paddle.to_tensor(self.src, stop_gradient=False) + if self.has_shortcut: + z_out = self.bn3(self.conv3(tensor_src)) + else: + z_out = tensor_src + bn1_out = self.bn1(self.conv1(tensor_src)) + bn2_out = self.bn2(self.conv2(bn1_out)) + result = self.relu(bn2_out + z_out) + paddle.autograd.backward([result], [paddle.to_tensor(self.dout)], + True) + return result, tensor_src.grad + + def FusedResNetBasicBlock(self): + paddle.disable_static() + + fused_conv1_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + fused_conv2_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + fused_conv3_weight = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + fused_bn1_weight = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0)) + fused_bn1_bias = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.0)) + fused_bn2_weight = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0)) + fused_bn2_bias = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.0)) + fused_bn3_weight = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0)) + fused_bn3_bias = fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.0)) + + if self.has_shortcut: + self.resnet_basic_block = ResNetBasicBlock( + num_channels1=self.in_channels, + num_filter1=self.out_channels, + filter1_size=3, + num_channels2=self.out_channels, + num_filter2=self.out_channels, + filter2_size=3, + num_channels3=self.in_channels, + num_filter3=self.out_channels, + filter3_size=1, + filter1_attr=fused_conv1_weight, + scale1_attr=fused_bn1_weight, + bias1_attr=fused_bn1_bias, + filter2_attr=fused_conv2_weight, + scale2_attr=fused_bn2_weight, + bias2_attr=fused_bn2_bias, + filter3_attr=fused_conv3_weight, + scale3_attr=fused_bn3_weight, + bias3_attr=fused_bn3_bias, + stride1=self.stride, + stride2=1, + stride3=self.stride, + act='relu', + padding1=1, + padding2=1, + padding3=0, + has_shortcut=True) + else: + self.resnet_basic_block = ResNetBasicBlock( + num_channels1=self.in_channels, + num_filter1=self.out_channels, + filter1_size=3, + num_channels2=self.out_channels, + num_filter2=self.out_channels, + filter2_size=3, + num_channels3=self.in_channels, + num_filter3=self.out_channels, + filter3_size=1, + filter1_attr=fused_conv1_weight, + scale1_attr=fused_bn1_weight, + bias1_attr=fused_bn1_bias, + filter2_attr=fused_conv2_weight, + scale2_attr=fused_bn2_weight, + bias2_attr=fused_bn2_bias, + filter3_attr=fused_conv3_weight, + scale3_attr=fused_bn3_weight, + bias3_attr=fused_bn3_bias, + stride1=self.stride, + stride2=1, + stride3=self.stride, + act='relu', + padding1=1, + padding2=1, + padding3=1, + has_shortcut=False) + + x = paddle.to_tensor(self.src, stop_gradient=False) + out = self.resnet_basic_block.forward(x) + paddle.autograd.backward([out], [paddle.to_tensor(self.dout)]) + return out, x.grad + + def test_out_and_grad_has_shortcut(self): + self.has_shortcut = True + default_main_program().random_seed = 1 + base_out, base_grad = self.Base() + fused_out, fused_grad = self.FusedResNetBasicBlock() + np.testing.assert_allclose(base_out.numpy(), + fused_out.numpy(), + rtol=self.rtol, + atol=self.atol) + 
np.testing.assert_allclose(base_grad.numpy(), + fused_grad.numpy(), + rtol=self.rtol, + atol=self.atol) + + def test_out_and_grad(self): + self.has_shortcut = False + default_main_program().random_seed = 1 + base_out, base_grad = self.Base() + fused_out, fused_grad = self.FusedResNetBasicBlock() + np.testing.assert_allclose(base_out.numpy(), + fused_out.numpy(), + rtol=self.rtol, + atol=self.atol) + np.testing.assert_allclose(base_grad.numpy(), + fused_grad.numpy(), + rtol=self.rtol, + atol=self.atol) + + +support_types = get_xpu_op_support_types('resnet_basic_block') +for stype in support_types: + create_test_class(globals(), + XPUTestResNetBasicBlockOp, + stype, + ignore_deivce_version=[core.XPUVersion.XPU1]) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index 8a28b65f696b3..543b0b815c16e 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -38,6 +38,7 @@ from ..fluid.layers.loss import identity_loss from ..fluid.incubate import fleet +from . import xpu __all__ = [ 'LookAhead', diff --git a/python/paddle/incubate/xpu/__init__.py b/python/paddle/incubate/xpu/__init__.py new file mode 100644 index 0000000000000..33a93b00f51da --- /dev/null +++ b/python/paddle/incubate/xpu/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .resnet_block import ResNetBasicBlock diff --git a/python/paddle/incubate/xpu/resnet_block.py b/python/paddle/incubate/xpu/resnet_block.py new file mode 100644 index 0000000000000..2b690cd7bf929 --- /dev/null +++ b/python/paddle/incubate/xpu/resnet_block.py @@ -0,0 +1,468 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import collections +import itertools +import six +import math +import sys +import warnings +from functools import partial, reduce + +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle import framework +from paddle.nn import initializer as I +from paddle.nn import Layer, LayerList +from paddle.fluid.layers import utils +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.data_feeder import convert_dtype +from paddle.fluid.param_attr import ParamAttr +from paddle import _C_ops + +__all__ = ['resnet_basic_block', 'ResNetBasicBlock'] + + +def resnet_basic_block(x, + filter1, + scale1, + bias1, + mean1, + var1, + filter2, + scale2, + bias2, + mean2, + var2, + filter3, + scale3, + bias3, + mean3, + var3, + stride1, + stride2, + stride3, + padding1, + padding2, + padding3, + dilation1, + dilation2, + dilation3, + groups, + momentum, + eps, + data_format, + has_shortcut, + use_global_stats=None, + training=False, + trainable_statistics=False, + find_conv_max=True): + + if fluid.framework.in_dygraph_mode(): + attrs = ('stride1', stride1, 'stride2', stride2, 'stride3', stride3, + 'padding1', padding1, 'padding2', padding2, 'padding3', + padding3, 'dilation1', dilation1, 'dilation2', dilation2, + 'dilation3', dilation3, 'group', groups, 'momentum', momentum, + 'epsilon', eps, 'data_format', data_format, 'has_shortcut', + has_shortcut, 'use_global_stats', use_global_stats, + "trainable_statistics", trainable_statistics, 'is_test', + not training, 'act_type', "relu", 'find_conv_input_max', + find_conv_max) + + out, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ = \ + getattr(_C_ops, "resnet_basic_block")(x, filter1, scale1, bias1, mean1, var1, filter2, scale2, bias2, mean2, var2, \ + filter3, scale3, bias3, mean3, var3, mean1, var1, mean2, var2, mean3, var3, *attrs) + return out + helper = LayerHelper('resnet_basic_block', **locals()) + bn_param_dtype = fluid.core.VarDesc.VarType.FP32 + max_dtype = fluid.core.VarDesc.VarType.FP32 + + out = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + conv1 = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + saved_mean1 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_invstd1 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + running_mean1 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if mean1 is None else mean1 + running_var1 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if var1 is None else var1 + conv2 = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + conv2_input = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + saved_mean2 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_invstd2 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + running_mean2 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if mean2 is None else mean2 + running_var2 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) if var2 is None else var2 + conv3 = helper.create_variable_for_type_inference(dtype=x.dtype, + stop_gradient=True) + saved_mean3 = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_invstd3 = 
helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True)
+    running_mean3 = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True) if mean3 is None else mean3
+    running_var3 = helper.create_variable_for_type_inference(
+        dtype=bn_param_dtype, stop_gradient=True) if var3 is None else var3
+    conv1_input_max = helper.create_variable_for_type_inference(
+        dtype=max_dtype, stop_gradient=True)
+    conv1_filter_max = helper.create_variable_for_type_inference(
+        dtype=max_dtype, stop_gradient=True)
+    conv2_input_max = helper.create_variable_for_type_inference(
+        dtype=max_dtype, stop_gradient=True)
+    conv2_filter_max = helper.create_variable_for_type_inference(
+        dtype=max_dtype, stop_gradient=True)
+    conv3_input_max = helper.create_variable_for_type_inference(
+        dtype=max_dtype, stop_gradient=True)
+    conv3_filter_max = helper.create_variable_for_type_inference(
+        dtype=max_dtype, stop_gradient=True)
+
+    inputs = {
+        'X': x,
+        'Filter1': filter1,
+        'Scale1': scale1,
+        'Bias1': bias1,
+        'Mean1': mean1,
+        'Var1': var1,
+        'Filter2': filter2,
+        'Scale2': scale2,
+        'Bias2': bias2,
+        'Mean2': mean2,
+        'Var2': var2,
+        'Filter3': filter3,
+        'Scale3': scale3,
+        'Bias3': bias3,
+        'Mean3': mean3,
+        'Var3': var3,
+    }
+
+    attrs = {
+        'stride1': stride1,
+        'stride2': stride2,
+        'stride3': stride3,
+        'padding1': padding1,
+        'padding2': padding2,
+        'padding3': padding3,
+        'dilation1': dilation1,
+        'dilation2': dilation2,
+        'dilation3': dilation3,
+        'group': groups,
+        'momentum': momentum,
+        'epsilon': eps,
+        'data_format': data_format,
+        'has_shortcut': has_shortcut,
+        'use_global_stats': use_global_stats,
+        "trainable_statistics": trainable_statistics,
+        'is_test': not training,
+        'act_type': "relu",
+        'find_conv_input_max': find_conv_max
+    }
+
+    outputs = {
+        'Y': out,
+        'Conv1': conv1,
+        'SavedMean1': saved_mean1,
+        'SavedInvstd1': saved_invstd1,
+        'Mean1Out': running_mean1,
+        'Var1Out': running_var1,
+        'Conv2': conv2,
+        'SavedMean2': saved_mean2,
+        'SavedInvstd2': saved_invstd2,
+        'Mean2Out': running_mean2,
+        'Var2Out': running_var2,
+        'Conv2Input': conv2_input,
+        'Conv3': conv3,
+        'SavedMean3': saved_mean3,
+        'SavedInvstd3': saved_invstd3,
+        'Mean3Out': running_mean3,
+        'Var3Out': running_var3,
+        'MaxInput1': conv1_input_max,
+        'MaxFilter1': conv1_filter_max,
+        'MaxInput2': conv2_input_max,
+        'MaxFilter2': conv2_filter_max,
+        'MaxInput3': conv3_input_max,
+        'MaxFilter3': conv3_filter_max,
+    }
+    helper.append_op(type='resnet_basic_block',
+                     inputs=inputs,
+                     outputs=outputs,
+                     attrs=attrs)
+    return out
+
+
+class ResNetBasicBlock(Layer):
+    """
+    ResNetBasicBlock is designed to optimize the performance of the basic unit of the SSD ResNet block.
+ The fusion op architecture like this: + has_shortcut = True: else: + X X + / / + | | | | + CONV1 | CONV1 | + | | | | + BN1 | BN1 | + | | | | + RELU1 | RELU1 | + | | | | + CONV2 CONV3 CONV2 | + | | | | + BN2 BN3 BN2 | + \ / \ / + ADD ADD + | | + RELU RELU + | | + Y Y + """ + + def __init__(self, + num_channels1, + num_filter1, + filter1_size, + num_channels2, + num_filter2, + filter2_size, + num_channels3, + num_filter3, + filter3_size, + stride1=1, + stride2=1, + stride3=1, + act='relu', + momentum=0.9, + eps=1e-5, + data_format='NCHW', + has_shortcut=False, + use_global_stats=False, + is_test=False, + filter1_attr=None, + scale1_attr=None, + bias1_attr=None, + moving_mean1_name=None, + moving_var1_name=None, + filter2_attr=None, + scale2_attr=None, + bias2_attr=None, + moving_mean2_name=None, + moving_var2_name=None, + filter3_attr=None, + scale3_attr=None, + bias3_attr=None, + moving_mean3_name=None, + moving_var3_name=None, + padding1=0, + padding2=0, + padding3=0, + dilation1=1, + dilation2=1, + dilation3=1, + trainable_statistics=False, + find_conv_max=True): + super(ResNetBasicBlock, self).__init__() + self._stride1 = stride1 + self._stride2 = stride2 + self._kernel1_size = utils.convert_to_list(filter1_size, 2, + 'filter1_size') + self._kernel2_size = utils.convert_to_list(filter2_size, 2, + 'filter2_size') + self._dilation1 = dilation1 + self._dilation2 = dilation2 + self._padding1 = padding1 + self._padding2 = padding2 + self._groups = 1 + self._momentum = momentum + self._eps = eps + self._data_format = data_format + self._act = act + self._has_shortcut = has_shortcut + self._use_global_stats = use_global_stats + self._is_test = is_test + self._trainable_statistics = trainable_statistics + self._find_conv_max = find_conv_max + + if has_shortcut: + self._kernel3_size = utils.convert_to_list(filter3_size, 2, + 'filter3_size') + self._padding3 = padding3 + self._stride3 = stride3 + self._dilation3 = dilation3 + else: + self._kernel3_size = None + self._padding3 = 1 + self._stride3 = 1 + self._dilation3 = 1 + + # check format + valid_format = {'NCHW'} + if data_format not in valid_format: + raise ValueError( + "conv_format must be one of {}, but got conv_format={}".format( + valid_format, data_format)) + + def _get_default_param_initializer(channels, kernel_size): + filter_elem_num = np.prod(kernel_size) * channels + std = (2.0 / filter_elem_num)**0.5 + return I.Normal(0.0, std) + + # init filter + bn_param_dtype = fluid.core.VarDesc.VarType.FP32 + bn1_param_shape = [1, 1, num_filter1] + bn2_param_shape = [1, 1, num_filter2] + filter1_shape = [num_filter1, num_channels1, filter1_size, filter1_size] + filter2_shape = [num_filter2, num_channels2, filter2_size, filter2_size] + + self.filter_1 = self.create_parameter( + shape=filter1_shape, + attr=filter1_attr, + default_initializer=_get_default_param_initializer( + num_channels1, self._kernel1_size)) + self.scale_1 = self.create_parameter( + shape=bn1_param_shape, + attr=scale1_attr, + dtype=bn_param_dtype, + default_initializer=I.Constant(1.0)) + self.bias_1 = self.create_parameter(shape=bn1_param_shape, + attr=bias1_attr, + dtype=bn_param_dtype, + is_bias=True) + self.mean_1 = self.create_parameter(attr=ParamAttr( + name=moving_mean1_name, + initializer=I.Constant(0.0), + trainable=False), + shape=bn1_param_shape, + dtype=bn_param_dtype) + self.mean_1.stop_gradient = True + self.var_1 = self.create_parameter( + attr=ParamAttr(name=moving_var1_name, + initializer=I.Constant(1.0), + trainable=False), + shape=bn1_param_shape, + 
dtype=bn_param_dtype) + self.var_1.stop_gradient = True + + self.filter_2 = self.create_parameter( + shape=filter2_shape, + attr=filter2_attr, + default_initializer=_get_default_param_initializer( + num_channels2, self._kernel2_size)) + self.scale_2 = self.create_parameter( + shape=bn2_param_shape, + attr=scale2_attr, + dtype=bn_param_dtype, + default_initializer=I.Constant(1.0)) + self.bias_2 = self.create_parameter(shape=bn2_param_shape, + attr=bias2_attr, + dtype=bn_param_dtype, + is_bias=True) + self.mean_2 = self.create_parameter(attr=ParamAttr( + name=moving_mean2_name, + initializer=I.Constant(0.0), + trainable=False), + shape=bn2_param_shape, + dtype=bn_param_dtype) + self.mean_2.stop_gradient = True + self.var_2 = self.create_parameter( + attr=ParamAttr(name=moving_var2_name, + initializer=I.Constant(1.0), + trainable=False), + shape=bn2_param_shape, + dtype=bn_param_dtype) + self.var_2.stop_gradient = True + + if has_shortcut: + bn3_param_shape = [1, 1, num_filter3] + filter3_shape = [ + num_filter3, num_channels3, filter3_size, filter3_size + ] + self.filter_3 = self.create_parameter( + shape=filter3_shape, + attr=filter3_attr, + default_initializer=_get_default_param_initializer( + num_channels3, self._kernel3_size)) + self.scale_3 = self.create_parameter( + shape=bn3_param_shape, + attr=scale3_attr, + dtype=bn_param_dtype, + default_initializer=I.Constant(1.0)) + self.bias_3 = self.create_parameter(shape=bn3_param_shape, + attr=bias3_attr, + dtype=bn_param_dtype, + is_bias=True) + self.mean_3 = self.create_parameter(attr=ParamAttr( + name=moving_mean3_name, + initializer=I.Constant(0.0), + trainable=False), + shape=bn3_param_shape, + dtype=bn_param_dtype) + self.mean_3.stop_gradient = True + self.var_3 = self.create_parameter(attr=ParamAttr( + name=moving_var3_name, + initializer=I.Constant(1.0), + trainable=False), + shape=bn3_param_shape, + dtype=bn_param_dtype) + self.var_3.stop_gradient = True + else: + self.filter_3 = None + self.scale_3 = None + self.bias_3 = None + self.mean_3 = None + self.var_3 = None + + def forward(self, x): + out = resnet_basic_block( + x, + self.filter_1, + self.scale_1, + self.bias_1, + self.mean_1, + self.var_1, + self.filter_2, + self.scale_2, + self.bias_2, + self.mean_2, + self.var_2, + self.filter_3, + self.scale_3, + self.bias_3, + self.mean_3, + self.var_3, + self._stride1, + self._stride2, + self._stride3, + self._padding1, + self._padding2, + self._padding3, + self._dilation1, + self._dilation2, + self._dilation3, + self._groups, + self._momentum, + self._eps, + self._data_format, + self._has_shortcut, + use_global_stats=self._use_global_stats, + training=self.training, + trainable_statistics=self._trainable_statistics, + find_conv_max=self._find_conv_max) + return out diff --git a/python/setup.py.in b/python/setup.py.in index 624218c5caf67..c02ef7f017fca 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -379,6 +379,7 @@ packages=['paddle', 'paddle.incubate.sparse.nn', 'paddle.incubate.sparse.nn.layer', 'paddle.incubate.sparse.nn.functional', + 'paddle.incubate.xpu', 'paddle.io', 'paddle.optimizer', 'paddle.nn', From fef62298c92594626929fa874d270c837033307c Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Thu, 14 Jul 2022 09:38:56 +0800 Subject: [PATCH 190/250] clean unittest.skipIf 0/N (#44285) --- .../fluid/tests/unittests/ipu/op_test_ipu.py | 79 ++++++++++++------- .../unittests/ipu/test_activation_ops_ipu.py | 2 - .../ipu/test_affine_channel_op_ipu.py | 2 - .../unittests/ipu/test_arg_max_op_ipu.py | 2 - 
.../unittests/ipu/test_arg_min_op_ipu.py | 2 - .../unittests/ipu/test_argsort_op_ipu.py | 2 - .../tests/unittests/ipu/test_assign_op_ipu.py | 2 - .../tests/unittests/ipu/test_avg_shard_ipu.py | 2 - .../unittests/ipu/test_batch_norm_op_ipu.py | 2 - .../ipu/test_binary_cross_entropy_op_ipu.py | 2 - .../tests/unittests/ipu/test_bmm_op_ipu.py | 2 - .../tests/unittests/ipu/test_cast_op_ipu.py | 2 - .../tests/unittests/ipu/test_clip_op_ipu.py | 2 - .../tests/unittests/ipu/test_concat_op_ipu.py | 2 - .../ipu/test_conv2d_transpose_op_ipu.py | 2 - .../tests/unittests/ipu/test_conv_op_ipu.py | 2 - .../unittests/ipu/test_dy2static_fp16_ipu.py | 37 ++++----- .../tests/unittests/ipu/test_dy2static_ipu.py | 52 +++++------- 18 files changed, 84 insertions(+), 114 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index becaaa4173ae7..90850b56aa657 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -67,9 +67,6 @@ def setUpClass(cls): random.seed(cls.SEED) paddle.seed(cls.SEED) - # Enable paddle static graph mode - paddle.enable_static() - @classmethod def tearDownClass(cls): """Restore random seeds""" @@ -86,43 +83,37 @@ def use_ipumodel(cls): if flag.upper() in ['1', "TRUE"]: return True - # Decorator for static graph building - def static_graph(builder): - def wrapper(self, *args, **kwargs): - self.scope = paddle.static.Scope() - self.main_prog = paddle.static.Program() - self.startup_prog = paddle.static.Program() - self.main_prog.random_seed = self.SEED - self.startup_prog.random_seed = self.SEED - with paddle.static.scope_guard(self.scope): - with paddle.utils.unique_name.guard( - paddle.utils.unique_name.generate('')): - with paddle.static.program_guard(self.main_prog, - self.startup_prog): - builder(self, *args, **kwargs) +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class IPUD2STest(IPUTest): - return wrapper - - # Cast a fp32 model to a full-fp16 model @classmethod - def cast_model_to_fp16(cls, main_program): - amp_list = paddle.static.amp.CustomOpLists() - amp_list.unsupported_list = {} - to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( - main_program, amp_list, use_fp16_guard=False) - paddle.static.amp.cast_parameters_to_fp16( - paddle.CPUPlace(), - main_program, - to_fp16_var_names=to_fp16_var_names) + def setUpClass(cls): + super().setUpClass() + + # Disable paddle static graph mode + paddle.disable_static() + + def tearDown(self): + # Manual reset when using ipumodel + if self.use_ipumodel(): + paddle.framework.core.IpuBackend.get_instance().reset() +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") class IPUOpTest(IPUTest): + """Base Class for single op unit tests using static graph on IPU. 
+ """ @classmethod def setUpClass(cls): super().setUpClass() + # Enable paddle static graph mode + paddle.enable_static() + # Items that a op_tester needs cls.main_prog: paddle.static.Program = None cls.startup_prog: paddle.static.Program = None @@ -166,6 +157,36 @@ def set_training(self): self.is_training = False self.epoch = 1 + # Decorator for static graph building + def static_graph(builder): + + def wrapper(self, *args, **kwargs): + self.scope = paddle.static.Scope() + self.main_prog = paddle.static.Program() + self.startup_prog = paddle.static.Program() + self.main_prog.random_seed = self.SEED + self.startup_prog.random_seed = self.SEED + with paddle.static.scope_guard(self.scope): + with paddle.utils.unique_name.guard( + paddle.utils.unique_name.generate('')): + with paddle.static.program_guard(self.main_prog, + self.startup_prog): + builder(self, *args, **kwargs) + + return wrapper + + # Cast a fp32 model to a full-fp16 model + @classmethod + def cast_model_to_fp16(cls, main_program): + amp_list = paddle.static.amp.CustomOpLists() + amp_list.unsupported_list = {} + to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( + main_program, amp_list, use_fp16_guard=False) + paddle.static.amp.cast_parameters_to_fp16( + paddle.CPUPlace(), + main_program, + to_fp16_var_names=to_fp16_var_names) + def run_op_test(self, exec_mode, ipu_strategy=None): # NOTE: some op has no inputs # if len(self.feed_list) == 0 or len(self.fetch_list) == 0: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py index 3c5a90afced72..97ee7a45e001c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_ops_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py index 09a251585b381..836b99099ffe0 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_affine_channel_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py index 3612656cea354..078e744ae507d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_min_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_min_op_ipu.py index 181f2017173b4..30c604901e877 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_arg_min_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_min_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not 
paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_argsort_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_argsort_op_ipu.py index c1b585513d8b1..3f19da43c71c3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_argsort_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_argsort_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py index af03480fbf698..93cdaf018b400 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index 3f45bf485b817..a3be5458ad83f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 2d2d331543930..08e5049a790eb 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py index 121755226ec34..113412b834110 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_binary_cross_entropy_op_ipu.py @@ -21,8 +21,6 @@ import paddle.nn.functional as F -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_bmm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_bmm_op_ipu.py index 5a08774c236c2..8ea20cebf07a3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_bmm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_bmm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index f361b779bb30b..6799f4141a416 100644 --- 
a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py index c61685e4a5e30..a221ad617671d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_clip_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index d0160551b93bd..733a5291cf50b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py index 64fdcc26636cf..6136bf34ffb67 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv2d_transpose_op_ipu.py @@ -20,8 +20,6 @@ from op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index 8fe7ee53ca2a8..3fac45bbbd904 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py index 5168a6db339dc..23ba121a07f2e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py @@ -12,16 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function - import tempfile import unittest import numpy as np import paddle -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -SEED = 2022 +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUD2STest class SimpleLayer(paddle.nn.Layer): @@ -48,22 +44,19 @@ def forward(self, x, target=None): return x -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestBase(IPUOpTest): +class TestBase(IPUD2STest): - @classmethod - def setUpClass(cls): - paddle.disable_static() - cls.save_path = tempfile.TemporaryDirectory() + def setUp(self): + super().setUp() + self.save_path = tempfile.TemporaryDirectory() - @classmethod - def tearDownClass(cls): - cls.save_path.cleanup() + def tearDown(self): + super().tearDown() + self.save_path.cleanup() def _test(self, use_ipu=False): - paddle.seed(SEED) - np.random.seed(SEED) + paddle.seed(self.SEED) + np.random.seed(self.SEED) model = SimpleLayer(use_ipu) specs = [ paddle.static.InputSpec(name="x", @@ -82,7 +75,7 @@ def _test(self, use_ipu=False): self.save_path, 'ipu' if use_ipu else 'cpu') if use_ipu: - device = paddle.set_device('ipu') + paddle.set_device('ipu') ipu_strategy = paddle.static.IpuStrategy() ipu_strategy.set_graph_config(num_ipus=1, is_training=True, @@ -92,15 +85,15 @@ def _test(self, use_ipu=False): ipu_strategy.set_optimizer(optim) data = data.astype(np.float16) + epochs = 100 result = [] - for epoch in range(100): + for _ in range(epochs): # ipu only needs call model() to do forward/backward/grad_update pred, loss = model(data, label) if not use_ipu: loss.backward() optim.step() optim.clear_grad() - result.append(loss) if use_ipu: @@ -108,11 +101,10 @@ def _test(self, use_ipu=False): paddle.save(model.state_dict(), model_path) paddle.save(optim.state_dict(), optim_path) - model.set_state_dict(paddle.load(model_path)) optim.set_state_dict(paddle.load(optim_path)) - for epoch in range(100): + for _ in range(epochs): # ipu only needs call model() to do forward/backward/grad_update pred, loss = model(data, label) if not use_ipu: @@ -130,7 +122,6 @@ def _test(self, use_ipu=False): def test_training(self): cpu_loss = self._test(False).flatten() ipu_loss = self._test(True).flatten() - self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-2)) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py index 4cc9baea9f4b6..7b581de222819 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function - import tempfile import unittest import numpy as np import paddle from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramCache -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUD2STest from paddle.jit import to_static from paddle.optimizer.lr import LRScheduler from functools import partial -SEED = 2022 - class SimpleLayer(paddle.nn.Layer): @@ -64,12 +60,9 @@ def forward(self, x, target=None): return x -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestBase(IPUOpTest): +class TestBase(IPUD2STest): def setUp(self): - paddle.disable_static() self.set_op_attrs() self.set_data_feed() @@ -87,14 +80,14 @@ def create_model(self, use_ipu=False): use_identity_loss=use_ipu) def _test(self, use_ipu=False): - paddle.seed(SEED) - np.random.seed(SEED) + paddle.seed(self.SEED) + np.random.seed(self.SEED) model = self.create_model(use_ipu) optim = paddle.optimizer.Adam(learning_rate=0.01, parameters=model.parameters()) if use_ipu: - device = paddle.set_device('ipu') + paddle.set_device('ipu') ipu_strategy = paddle.static.IpuStrategy() ipu_strategy.set_graph_config(num_ipus=1, is_training=True, @@ -102,15 +95,15 @@ def _test(self, use_ipu=False): enable_manual_shard=False) ipu_strategy.set_optimizer(optim) + epochs = 100 result = [] - for epoch in range(100): + for _ in range(epochs): # ipu only needs call model() to do forward/backward/grad_update pred, loss = model(self.data, self.label) if not use_ipu: loss.backward() optim.step() optim.clear_grad() - result.append(loss) if use_ipu: @@ -121,23 +114,22 @@ def _test(self, use_ipu=False): def test_training(self): ipu_loss = self._test(True).flatten() cpu_loss = self._test(False).flatten() - self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-4)) class TestSaveLoad(TestBase): - @classmethod - def setUpClass(cls): - cls.save_path = tempfile.TemporaryDirectory() + def setUp(self): + super().setUp() + self.save_path = tempfile.TemporaryDirectory() - @classmethod - def tearDownClass(cls): - cls.save_path.cleanup() + def tearDown(self): + super().tearDown() + self.save_path.cleanup() def _test(self, use_ipu=False): - paddle.seed(SEED) - np.random.seed(SEED) + paddle.seed(self.SEED) + np.random.seed(self.SEED) model = self.create_model(use_ipu) optim = paddle.optimizer.Adam(learning_rate=0.01, parameters=model.parameters()) @@ -147,7 +139,7 @@ def _test(self, use_ipu=False): self.save_path, 'ipu' if use_ipu else 'cpu') if use_ipu: - device = paddle.set_device('ipu') + paddle.set_device('ipu') ipu_strategy = paddle.static.IpuStrategy() ipu_strategy.set_graph_config(num_ipus=1, is_training=True, @@ -155,15 +147,15 @@ def _test(self, use_ipu=False): enable_manual_shard=False) ipu_strategy.set_optimizer(optim) + epochs = 100 result = [] - for epoch in range(100): + for _ in range(epochs): # ipu only needs call model() to do forward/backward/grad_update pred, loss = model(self.data, self.label) if not use_ipu: loss.backward() optim.step() optim.clear_grad() - result.append(loss) if use_ipu: @@ -171,18 +163,16 @@ def _test(self, use_ipu=False): paddle.save(model.state_dict(), model_path) paddle.save(optim.state_dict(), optim_path) - model.set_state_dict(paddle.load(model_path)) optim.set_state_dict(paddle.load(optim_path)) - for epoch in range(100): + for _ in range(epochs): # ipu only needs call model() to do forward/backward/grad_update pred, loss = model(self.data, self.label) 
if not use_ipu: loss.backward() optim.step() optim.clear_grad() - result.append(loss) if use_ipu: @@ -191,9 +181,7 @@ def _test(self, use_ipu=False): return np.array(result) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestPatch(IPUOpTest): +class TestPatch(IPUD2STest): def setUp(cls): paddle.disable_static() From de002f9289aa53e095c3c479dffcb5aeda4ba14e Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Thu, 14 Jul 2022 09:40:29 +0800 Subject: [PATCH 191/250] hide prim2orig in executor (#44255) * hide prim2orig in executor * add some test cases without param guard * fix spell error param into program * Use absolute path when import paddle.incubate.autograd.prim2orig --- python/paddle/fluid/executor.py | 5 + .../tests/unittests/autograd/test_primapi.py | 111 ++++++++++++++++++ 2 files changed, 116 insertions(+) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 7e450710d211c..fac39df117bef 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1444,6 +1444,11 @@ def _can_use_interpreter_core(program, place): program._compile(scope, self.place) ir_graph = framework.IrGraph(program._graph) inner_program = ir_graph.to_program() + else: + from paddle.incubate.autograd import prim_enabled, prim2orig + if prim_enabled() and program == default_main_program(): + prim2orig() + program = self._add_feed_fetch_ops( program=inner_program, feed=feed, diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py index ec06eda66f8e5..09bd64ee67834 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py @@ -23,6 +23,117 @@ import utils +@utils.place(config.DEVICES) +@utils.parameterize( + (utils.TEST_CASE_NAME, 'fun', 'xs', 'v', 'dtype'), + (('matmul', paddle.matmul, + (np.random.rand(2, 3), np.random.rand(3, 2)), None, 'float32'), )) +class TestWithoutProgramGuard(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.xs = tuple(x.astype(cls.dtype) for x in cls.xs) + cls._rtol = config.TOLERANCE.get(str( + cls.dtype)).get("first_order_grad").get("rtol") + cls._atol = config.TOLERANCE.get(str( + cls.dtype)).get("first_order_grad").get("atol") + + def setUp(self): + paddle.enable_static() + paddle.incubate.autograd.enable_prim() + + def tearDown(self): + paddle.incubate.autograd.disable_prim() + paddle.disable_static() + + def test_forward_grad_without_program_guard(self): + + def with_program_guard(): + paddle.incubate.autograd.enable_prim() + sp = paddle.static.Program() + mp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + feed, static_xs, static_v = utils.gen_static_data_and_feed( + self.xs, self.v, stop_gradient=False) + ys = self.fun(*static_xs) if isinstance( + static_xs, typing.Sequence) else self.fun(static_xs) + ys_grad = paddle.incubate.autograd.forward_grad( + ys, static_xs, static_v) + paddle.incubate.autograd.prim2orig(mp.block(0)) + exe = paddle.static.Executor() + exe.run(sp) + out = exe.run(mp, feed=feed, fetch_list=ys_grad) + paddle.incubate.autograd.disable_prim() + return out + + def without_program_guard(): + paddle.incubate.autograd.enable_prim() + feed, static_xs, static_v = utils.gen_static_data_and_feed( + self.xs, self.v, stop_gradient=False) + ys = self.fun(*static_xs) if isinstance( + static_xs, typing.Sequence) else self.fun(static_xs) + 
ys_grad = paddle.incubate.autograd.forward_grad( + ys, static_xs, static_v) + sp = paddle.fluid.framework.default_startup_program() + mp = paddle.fluid.framework.default_main_program() + exe = paddle.static.Executor() + exe.run(sp) + out = exe.run(mp, feed=feed, fetch_list=ys_grad) + paddle.incubate.autograd.disable_prim() + return out + + expected = with_program_guard() + actual = without_program_guard() + self.assertEqual(type(actual), type(expected)) + np.testing.assert_allclose(np.concatenate(actual), + np.concatenate(expected), + rtol=self._rtol, + atol=self._atol) + + def test_grad_without_program_guard(self): + + def with_program_guard(): + paddle.incubate.autograd.enable_prim() + sp = paddle.static.Program() + mp = paddle.static.Program() + with paddle.static.program_guard(mp, sp): + feed, static_xs, static_v = utils.gen_static_data_and_feed( + self.xs, self.v, stop_gradient=False) + ys = self.fun(*static_xs) if isinstance( + static_xs, typing.Sequence) else self.fun(static_xs) + xs_grad = paddle.incubate.autograd.grad(ys, static_xs, static_v) + paddle.incubate.autograd.prim2orig(mp.block(0)) + exe = paddle.static.Executor() + exe.run(sp) + out = exe.run(mp, feed=feed, fetch_list=xs_grad) + paddle.incubate.autograd.disable_prim() + return out + + def without_program_guard(): + paddle.incubate.autograd.enable_prim() + feed, static_xs, static_v = utils.gen_static_data_and_feed( + self.xs, self.v, stop_gradient=False) + ys = self.fun(*static_xs) if isinstance( + static_xs, typing.Sequence) else self.fun(static_xs) + xs_grad = paddle.incubate.autograd.grad(ys, static_xs, static_v) + sp = paddle.fluid.framework.default_startup_program() + mp = paddle.fluid.framework.default_main_program() + exe = paddle.static.Executor() + exe.run(sp) + out = exe.run(mp, feed=feed, fetch_list=xs_grad) + paddle.incubate.autograd.disable_prim() + return out + + expected = with_program_guard() + actual = without_program_guard() + for i, j in zip(actual, expected): + self.assertEqual(type(i), type(j)) + np.testing.assert_allclose(np.concatenate(i), + np.concatenate(j), + rtol=self._rtol, + atol=self._atol) + + @utils.place(config.DEVICES) @utils.parameterize( (utils.TEST_CASE_NAME, 'fun', 'xs', 'v', 'dtype'), From c0fb67f7a19405d9cba070402cc2436cdc99ce27 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Thu, 14 Jul 2022 09:40:48 +0800 Subject: [PATCH 192/250] clean unittest.skipIf 1/N (#44286) --- .../fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_dot_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py | 2 -- .../tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py | 2 -- 
python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py | 2 -- 19 files changed, 38 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index 5c456e2f4c331..92cf442fe27cc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py index 99cb47394ff5e..5f859b064feac 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py index 94225660f4d59..de84b94bb7dde 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_data_norm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py index c84e8ce9bebad..5f8db4faba744 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dist_op_ipu.py @@ -20,8 +20,6 @@ from op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dot_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dot_op_ipu.py index fb090cc5913a4..ed0c36f53eb0c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dot_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dot_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py index be96762549dd4..d104b39c29246 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index f78f446404dcb..9c35e43970e74 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestMul(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py index ad419c2e2bfc5..77a78a7cb78ca 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py index f81f5d7de74d1..5df7bbadebf6b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_eval_model_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py index b299d9cfac728..ee68eba5e543f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_as_v2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py index 872f4a4bef160..843ec0438d74d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py index 77872c9ebe47d..5cb949a1943e7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_v2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py index 28e569d911847..74ecba6f18c86 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py @@ -20,8 +20,6 @@ from 
paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py index 4d4d88351892f..7598b32581acc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py index 4f84f20c1f1d5..4723f753fb698 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_contiguous_range_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py index 29dd9510dda40..d7c1da14e296f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py index 17b1bd9b2d0ea..07e0acb60a123 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_flip_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py index 0cfe769225001..708dd0f405424 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fp16_support_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py index 42ba6babd7911..13a48e5a98f1b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): From b107eff9f035e9ea78c139058a681fc9c9300f08 
Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Thu, 14 Jul 2022 09:41:00 +0800 Subject: [PATCH 193/250] clean unittest.skipIf 2/N (#44287) --- python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_gradient_clip_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_inference_model_io_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py | 4 ---- .../paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_logical_not_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py | 2 -- 19 files changed, 40 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py index 673c7c0503242..2d14621d5fc7e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py index 7eea222e5e3c4..b63d176ff2791 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gradient_clip_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py index 56845eef475fa..4f2e9a1a94bfc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestGreaterThan(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py index 4c5098640fdba..dec4c6e1306a4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with 
IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py index a28120d820e5d..514b926dc82af 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_huber_loss_op_ipu.py @@ -21,8 +21,6 @@ import paddle.nn.functional as F -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py index 18cd5e30e88c1..d3a700b629647 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py index 3828728a567c3..b24e4be7ae738 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py index 0d15f20273f04..70d01e120efc2 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_interpolate_ops_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py index 13f146f6fd741..f4a48cf134051 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py @@ -21,8 +21,6 @@ paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestIpuShard(unittest.TestCase): def _test(self): @@ -65,8 +63,6 @@ def test_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestIpuPipeline(unittest.TestCase): def _test(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py index 14128109029c7..6fa3d77ead8a4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -20,8 +20,6 @@ paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestIpuStrategy(unittest.TestCase): def test_set_options(self): 
diff --git a/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py index d6d48c650634d..8af6664179a97 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_kldiv_loss_op_ipu.py @@ -21,8 +21,6 @@ import paddle.nn.functional as F -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py index e365ffd4e166f..9bf457d6f924f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py index a406fa128fc5b..d8eaa2f81bceb 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py index 71a75db9ab392..79c22f47da5c9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_x_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestLogicalAnd(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py index 27a70329ca132..ffcf8a64f53f9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py index c15eb3a3b8edb..2c8e7159cf217 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py index f7a01b7268ddf..6c663bd5ac927 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py +++ 
b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py @@ -31,8 +31,6 @@ def get_lr(self): return self.base_lr -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestConvNet(IPUOpTest): @IPUOpTest.static_graph diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index fb8cf86b71cd1..bf2af886959b5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py index 8151c55326500..6ffb05dfd254b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_serilize_ipu.py @@ -26,8 +26,6 @@ def set_serialize_factor(serialize_factor): op._set_attr('serialize_factor', serialize_factor) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): From 65c81c8a7b41bc39c88c261121ce236e12104187 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Thu, 14 Jul 2022 09:41:12 +0800 Subject: [PATCH 194/250] clean unittest.skipIf 3/N (#44290) --- .../paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py | 2 -- .../tests/unittests/ipu/test_mixed_precision_inference_ipu.py | 2 -- .../tests/unittests/ipu/test_mixed_precision_training_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_model_parallel_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_model_pipeline_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py | 4 ---- .../paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py | 2 -- 19 files changed, 40 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py index 6e84066a4a1b1..37f575f64bd99 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py index c1d144cd56443..0f60ed2485e7e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py index 4efd4c5714bf8..8c3306aed1318 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_meshgrid_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py index ba8f9c7bad51f..21bcb7b7314ab 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py index 4fc3b40f9ab8c..4524c1103052d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py index 81f5295c7dda8..253a87a6b7fa6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py index 9f7ebc52834ac..fb5f25619bf96 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py index 50be6420a5569..a5ace5f1bf1c9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py +++ 
b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py index c796cc7c02b42..c00a60775eb7f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_not_equal_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): @@ -90,8 +88,6 @@ def set_data_feed(self): self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestScalar(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py index 6c8c3b113143a..fe5b658426eee 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py index 8822c352b8ba5..e958cfd1f89ba 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_one_hot_v2_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py index 5169eddc70307..5041e8804a085 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py @@ -19,8 +19,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py index ec333ddff01b6..bd6ff58751d3f 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_p_norm_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py index 02a488180aa0e..c006da3c16d92 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pad_op_ipu.py @@ -20,8 +20,6 @@ 
from op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py index a9ffeb8dc0106..8a2aa26f1c2d8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py index e9fec9a02326d..dca1103a0cd98 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py index 3f596f951cd0c..8355f5eefde8c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py index 0200cce0a33d0..b80560dccb3f4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_prelu_op_ipu.py @@ -21,8 +21,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py index 1c050d1e485b8..a2da444519d29 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): From 0a848e3eef388367b6d95db07577ae59a97c0230 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Thu, 14 Jul 2022 09:41:22 +0800 Subject: [PATCH 195/250] clean unittest.skipIf 4/N (#44289) --- .../fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py | 2 -- .../tests/unittests/ipu/test_reshape_inplace_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_reshape_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_save_load_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_scale_op_ipu.py | 2 -- .../unittests/ipu/test_scaled_optimizer_state_ipu.py | 2 -- .../tests/unittests/ipu/test_set_batch_size_ipu.py | 2 -- .../tests/unittests/ipu/test_set_ipu_shard_api.py | 10 ---------- 
.../fluid/tests/unittests/ipu/test_slice_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_softmax_op_ipu.py | 2 -- .../ipu/test_softmax_with_cross_entropy_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_split_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_squeeze_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_stack_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_sum_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_topk_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_transpose_op_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_unary_ops_ipu.py | 2 -- .../fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py | 2 -- 19 files changed, 46 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index 4cfbb9a5e0b58..c78165f86e21a 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestMean(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py index 9a8c127ab650c..66358d83ee680 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py index 32cedf0cdda58..2da63cf733004 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py index 1b39ead9b84a8..7c6470af3d10b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py @@ -23,8 +23,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py index 8b6b8425b5209..296d365fea602 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py index 79527f7a13081..e1f6f7a23f294 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_scaled_optimizer_state_ipu.py @@ -19,8 +19,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py index 2af8de38377b9..9bce0b5df73df 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py b/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py index a7104fd4266f6..ca1cdb4073134 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_ipu_shard_api.py @@ -45,8 +45,6 @@ def linear_relu2(self, x): return x -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuShard(unittest.TestCase): def _test(self): @@ -80,8 +78,6 @@ def test_set_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuPipeline(unittest.TestCase): def _test(self): @@ -115,8 +111,6 @@ def test_set_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuShardAndPipeline(unittest.TestCase): def _test(self): @@ -157,8 +151,6 @@ def test_set_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuForModel(unittest.TestCase): def _test(self): @@ -194,8 +186,6 @@ def test_set_ipu_shard(self): np.allclose(ipu_index_list, expected_ipu_index_list, atol=0)) -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestSetIpuMixedModel(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py index 3a96d4bb0b9f8..3bcbe417b9861 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py index be803e61cf533..ebc05942b9358 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py @@ -20,8 +20,6 @@ from 
paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py index 21021cd9f598d..d3084154a063e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_with_cross_entropy_op_ipu.py @@ -21,8 +21,6 @@ import paddle.nn.functional as F -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py index 76b65a015e95f..8d8c5190692dc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py index 1afc79b6a6586..9039dfdb3f006 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py index 1828772c07a51..fa0a48081b4a4 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py index 084c68654239c..3c4f9ff80d557 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py index 417d9c37675c3..4194887ab2f05 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestTopKOp(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py index 03068d407b2f3..d7681b38a1728 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py index eac32819f8232..bbf0f7b6996ed 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unary_ops_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py index 998eee38b5e59..3f3b9f4f89062 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): From 325f59211900f7d6a4b8838d6662291fcc494442 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Thu, 14 Jul 2022 09:41:31 +0800 Subject: [PATCH 196/250] clean unittest.skipIf 5/N (#44288) --- .../fluid/tests/unittests/ipu/test_varname_inplace_ipu.py | 2 -- python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py | 2 -- .../paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py | 3 +-- .../fluid/tests/unittests/ipu/test_weight_sharing_ipu.py | 2 -- 4 files changed, 1 insertion(+), 8 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py index b3535c8cd5690..495bc0d656a56 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py index 8387b35015534..0e2de2817eaff 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_warpctc_op_ipu.py @@ -21,8 +21,6 @@ import paddle.nn.functional as F -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py index c2fa0e672729c..7fb467fced752 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py @@ -22,8 +22,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not 
paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") @unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") class TestBase(IPUOpTest): @@ -36,6 +34,7 @@ def setUp(self): self.model_path = os.path.join(self.temp_dir.name, "weight_decay") def tearDown(self): + super().tearDown() self.temp_dir.cleanup() def set_atol(self): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py index 52e88119af0e9..c06880b980854 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py @@ -20,8 +20,6 @@ from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") class TestWeightSharing(IPUOpTest): def setUp(self): From cb44b694ae7b958c8d8ac40da541703fa9faf580 Mon Sep 17 00:00:00 2001 From: WJJ1995 Date: Thu, 14 Jul 2022 09:53:52 +0800 Subject: [PATCH 197/250] fixed glog (#44316) --- .../framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc index c9eee31606cc3..b28b07924d888 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -99,10 +99,11 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( gpd(graph, handler); AddStatis(found_elementwise_activation_count); - PrettyLogDetail("--- fused %d %s with %s activation", - found_elementwise_activation_count, - elt_type, - act_type); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d %s with %s activation", + found_elementwise_activation_count, + elt_type, + act_type); } } // namespace ir From 270ba5709774b61bc98f18b6e278b6faf5e90ccc Mon Sep 17 00:00:00 2001 From: handiz <35895648+ZhangHandi@users.noreply.github.com> Date: Thu, 14 Jul 2022 10:34:59 +0800 Subject: [PATCH 198/250] fix acc diff problem caused by pr #44116 (#44311) --- .../slim/quantization/quantization_pass.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index d3ce543320ef4..e2502e7f5d447 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -1440,11 +1440,18 @@ def apply(self, graph): [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]: continue - scale_node = graph.create_persistable_node( - name=self._scale_name(in_node.name()), - var_type=core.VarDesc.VarType.LOD_TENSOR, - shape=[1], - var_dtype=in_node.dtype()) + try: + graph._find_node_by_name( + graph.all_var_nodes(), + self._scale_name(in_node.name())) + continue + except: + scale_node = graph.create_persistable_node( + name=self._scale_name(in_node.name()), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + var_dtype=in_node.dtype()) + data_type = 'float64' if in_node.dtype() \ == core.VarDesc.VarType.FP64 else 'float32' _init_var_node(scale_node, np.ones([1], dtype=data_type), From 84b72c5f0ee2067e0555e0884071292af4534c83 Mon Sep 17 00:00:00 2001 From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com> Date: Thu, 14 Jul 2022 10:42:19 +0800 
Subject: [PATCH 199/250] add xpu pnorm op and fix pool op, *test=kunlun (#44214) --- paddle/fluid/operators/p_norm_op_xpu.cc | 354 ++++++++++++++++++ paddle/fluid/operators/pool_op_xpu.cc | 40 ++ .../fluid/platform/device/xpu/xpu2_op_list.h | 2 + .../tests/unittests/xpu/test_p_norm_op_xpu.py | 186 +++++++++ .../tests/unittests/xpu/test_pool2d_op_xpu.py | 72 ++++ 5 files changed, 654 insertions(+) create mode 100644 paddle/fluid/operators/p_norm_op_xpu.cc create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py diff --git a/paddle/fluid/operators/p_norm_op_xpu.cc b/paddle/fluid/operators/p_norm_op_xpu.cc new file mode 100644 index 0000000000000..b37a65e794d08 --- /dev/null +++ b/paddle/fluid/operators/p_norm_op_xpu.cc @@ -0,0 +1,354 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device/xpu/xpu_header.h" + +namespace paddle { +namespace operators { + +inline void GetDims( + const phi::DDim& dim, int axis, int* m, int* t, int* n, bool asvector) { + *m = 1; + *n = 1; + *t = dim[axis]; + if (asvector) { + *t = product(dim); + } else { + for (int i = 0; i < axis; ++i) { + (*m) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*n) *= dim[i]; + } + } +} + +using Tensor = framework::Tensor; +template +class P_NormXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + float porder = ctx.Attr("porder"); + int axis = ctx.Attr("axis"); + bool asvector = ctx.Attr("asvector"); + + auto& dev_ctx = ctx.template device_context(); + auto xdim = in->dims(); + if (axis < 0) axis = xdim.size() + axis; + std::vector r_dim; + std::vector x_dim; + std::vector y_dim; + int m = 1; + int n = 1; + int t = 1; + GetDims(xdim, axis, &m, &t, &n, asvector); + x_dim.push_back(m); + x_dim.push_back(t); + x_dim.push_back(n); + + r_dim.push_back(1); + + y_dim.push_back(m); + y_dim.push_back(n); + + int r = 0; + + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + XPUType* tmp_x = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_x); + r = xpu::abs(dev_ctx.x_context(), + reinterpret_cast(in->data()), + tmp_x, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs"); + if (porder == INFINITY) { + r = xpu::reduce_max(dev_ctx.x_context(), + tmp_x, + reinterpret_cast(out->data()), + x_dim, + r_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_max"); + } else if (porder == -INFINITY) { + r = xpu::reduce_min(dev_ctx.x_context(), + tmp_x, + reinterpret_cast(out->data()), + x_dim, + r_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_min"); + } else 
if (porder == 0) { + XPUType* zeros = RAII_GUARD.alloc_l3_or_gm(1); + PADDLE_ENFORCE_XDNN_NOT_NULL(zeros); + r = xpu::constant(dev_ctx.x_context(), zeros, 1, 0.0f); + std::vector zeros_dim(1, 1); + + bool* tmp2_x = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(tmp2_x); + + r = xpu::broadcast_not_equal( + dev_ctx.x_context(), tmp_x, zeros, tmp2_x, x_dim, zeros_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_not_equal"); + + XPUType* x_mid = tmp_x; + + r = xpu::cast( + dev_ctx.x_context(), tmp2_x, x_mid, m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + + r = xpu::reduce_sum(dev_ctx.x_context(), + x_mid, + reinterpret_cast(out->data()), + x_dim, + r_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + + } else { + Tensor porder_tensor; + framework::DDim pdim = phi::make_ddim({1}); + porder_tensor.mutable_data(pdim, in->place()); + r = xpu::constant( + dev_ctx.x_context(), porder_tensor.data(), 1, porder); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + std::vector p_dim(1, 1); + + XPUType* tmp2_x = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(tmp2_x); + r = xpu::broadcast_pow( + dev_ctx.x_context(), + reinterpret_cast(tmp_x), + reinterpret_cast(porder_tensor.data()), + tmp2_x, + x_dim, + p_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); + + XPUType* tmp_y = RAII_GUARD.alloc_l3_or_gm(m * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(tmp_y); + + r = xpu::reduce_sum(dev_ctx.x_context(), + reinterpret_cast(tmp2_x), + tmp_y, + x_dim, + r_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + + r = xpu::constant( + dev_ctx.x_context(), porder_tensor.data(), 1, 1.0f / porder); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + + r = xpu::broadcast_pow( + dev_ctx.x_context(), + reinterpret_cast(tmp_y), + reinterpret_cast(porder_tensor.data()), + reinterpret_cast(out->data()), + y_dim, + p_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); + dev_ctx.Wait(); + } + } +}; + +template +class P_NormGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Out"); + auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); + auto xdim = x->dims(); + float porder = ctx.Attr("porder"); + bool asvector = ctx.Attr("asvector"); + int axis = ctx.Attr("axis"); + axis = axis < 0 ? 
xdim.size() + axis : axis; + + auto& dev_ctx = ctx.template device_context(); + + int m, t, n; + GetDims(xdim, axis, &m, &t, &n, asvector); + + std::vector r_dim; + std::vector x_dim; + std::vector y_dim; + + x_dim.push_back(m); + x_dim.push_back(t); + x_dim.push_back(n); + + y_dim.push_back(m); + y_dim.push_back(1); + y_dim.push_back(n); + + int r = 0; + if (porder == 0) { + r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(dx->data()), + m * t * n, + static_cast(0)); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + } else if (porder == INFINITY || porder == -INFINITY) { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + XPUType* x_abs = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(x_abs); + r = xpu::abs(dev_ctx.x_context(), + reinterpret_cast(x->data()), + x_abs, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs"); + + bool* dx_t = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(dx_t); + + XPUType* dx_mid = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(dx_mid); + + r = xpu::broadcast_equal( + dev_ctx.x_context(), + reinterpret_cast(x_abs), + reinterpret_cast(y->data()), + dx_t, + x_dim, + y_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_equal"); + + r = xpu::cast( + dev_ctx.x_context(), dx_t, dx_mid, m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + + XPUType* x_sign = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(x_sign); + r = xpu::sign(dev_ctx.x_context(), + reinterpret_cast(x->data()), + x_sign, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign"); + + XPUType* dx_pre_dy = x_abs; + r = xpu::mul(dev_ctx.x_context(), + reinterpret_cast(dx_mid), + reinterpret_cast(x_sign), + dx_pre_dy, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul"); + + r = xpu::broadcast_mul(dev_ctx.x_context(), + dx_pre_dy, + reinterpret_cast(dy->data()), + reinterpret_cast(dx->data()), + x_dim, + y_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul"); + + } else { + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + XPUType* x_abs = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(x_abs); + r = xpu::abs(dev_ctx.x_context(), + reinterpret_cast(x->data()), + x_abs, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "abs"); + + Tensor porder_tensor; + framework::DDim pdim = phi::make_ddim({1}); + porder_tensor.mutable_data(pdim, x->place()); + r = xpu::constant( + dev_ctx.x_context(), porder_tensor.data(), 1, porder - 1.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + std::vector p_dim(1, 1); + + XPUType* x_pow = RAII_GUARD.alloc_l3_or_gm(m * t * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(x_pow); + r = xpu::broadcast_pow( + dev_ctx.x_context(), + reinterpret_cast(x_abs), + reinterpret_cast(porder_tensor.data()), + x_pow, + x_dim, + p_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); + + XPUType* y_pow = RAII_GUARD.alloc_l3_or_gm(m * n); + PADDLE_ENFORCE_XDNN_NOT_NULL(y_pow); + r = xpu::broadcast_pow( + dev_ctx.x_context(), + reinterpret_cast(y->data()), + reinterpret_cast(porder_tensor.data()), + y_pow, + y_dim, + p_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_pow"); + dev_ctx.Wait(); + + XPUType* dx_t = x_abs; + + r = xpu::broadcast_div( + dev_ctx.x_context(), x_pow, y_pow, dx_t, x_dim, y_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_div"); + + XPUType* x_sign = x_pow; + r = xpu::sign(dev_ctx.x_context(), + reinterpret_cast(x->data()), + x_sign, + m * t * n); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sign"); + + XPUType* dx_mid = RAII_GUARD.alloc_l3_or_gm(m * t * n); + 
PADDLE_ENFORCE_XDNN_NOT_NULL(dx_mid); + + r = xpu::broadcast_mul(dev_ctx.x_context(), + reinterpret_cast(x_sign), + reinterpret_cast(dy->data()), + dx_mid, + x_dim, + y_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul"); + + r = xpu::broadcast_mul(dev_ctx.x_context(), + reinterpret_cast(dx_t), + reinterpret_cast(dx_mid), + reinterpret_cast(dx->data()), + x_dim, + x_dim); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul"); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + p_norm, ops::P_NormXPUKernel); +REGISTER_OP_XPU_KERNEL( + p_norm_grad, + ops::P_NormGradXPUKernel); + +#endif diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index d8d814a6ba78a..7208b195b4600 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/phi/kernels/funcs/pooling.h" #ifdef PADDLE_WITH_XPU namespace paddle { @@ -51,6 +52,9 @@ class PoolXPUKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); bool adaptive = context.Attr("adaptive"); + bool ceil_mode = context.Attr("ceil_mode"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); PADDLE_ENFORCE_EQ( ksize.size(), 2, @@ -70,10 +74,27 @@ class PoolXPUKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } + const int n = in_x->dims()[0]; const int c = in_x->dims()[1]; const int in_h = in_x->dims()[2]; const int in_w = in_x->dims()[3]; + + framework::DDim data_dims; + + data_dims = phi::slice_ddim(in_x->dims(), 2, in_x->dims().size()); + phi::funcs::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + ksize); + if (ceil_mode) { + paddings[1] += (strides[0] - 1); + paddings[3] += (strides[1] - 1); + } + auto input = reinterpret_cast(in_x->data()); out->mutable_data(context.GetPlace()); auto output = reinterpret_cast(out->data()); @@ -135,6 +156,9 @@ class PoolGradXPUKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); bool adaptive = context.Attr("adaptive"); + bool ceil_mode = context.Attr("ceil_mode"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); const int* index_data = nullptr; PADDLE_ENFORCE_EQ( ksize.size(), @@ -163,6 +187,22 @@ class PoolGradXPUKernel : public framework::OpKernel { const int c = in_x->dims()[1]; const int in_h = in_x->dims()[2]; const int in_w = in_x->dims()[3]; + + framework::DDim data_dims; + + data_dims = phi::slice_ddim(in_x->dims(), 2, in_x->dims().size()); + phi::funcs::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + data_dims, + strides, + ksize); + if (ceil_mode) { + paddings[1] += (strides[0] - 1); + paddings[3] += (strides[1] - 1); + } + auto input = reinterpret_cast(in_x->data()); auto output = reinterpret_cast(out->data()); auto output_grad = reinterpret_cast(out_grad->data()); diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 4a6f07b76ba57..a3165bc989384 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -323,6 +323,8 @@ XPUOpMap& get_kl2_ops() { {"one_hot_v2", 
XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace())})}, + {"p_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"p_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py new file mode 100644 index 0000000000000..049896527b940 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py @@ -0,0 +1,186 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +import sys +import unittest +from functools import reduce + +sys.path.append("..") +from op_test import OpTest +from op_test_xpu import XPUOpTest +from operator import mul +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +def ref_p_norm(x, axis, porder, keepdims=False, reduce_all=False): + r = [] + if axis is None or reduce_all: + x = x.flatten() + if porder == np.inf: + r = np.amax(np.abs(x), keepdims=keepdims) + elif porder == -np.inf: + r = np.amin(np.abs(x), keepdims=keepdims) + else: + r = np.linalg.norm(x, ord=porder, keepdims=keepdims) + elif isinstance(axis, list or tuple) and len(axis) == 2: + if porder == np.inf: + axis = tuple(axis) + r = np.amax(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == -np.inf: + axis = tuple(axis) + r = np.amin(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == 0: + axis = tuple(axis) + r = x.astype(bool) + r = np.sum(r, axis, keepdims=keepdims) + elif porder == 1: + axis = tuple(axis) + r = np.sum(np.abs(x), axis, keepdims=keepdims) + else: + axis = tuple(axis) + xp = np.power(np.abs(x), porder) + s = np.sum(xp, axis=axis, keepdims=keepdims) + r = np.power(s, 1.0 / porder) + else: + if isinstance(axis, list): + axis = tuple(axis) + r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims) + r = r.astype(x.dtype) + + return r + + +class XPUTestPNormOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'p_norm' + self.use_dynamic_create_class = False + + class TestXPUPNormOp(XPUOpTest): + + def setUp(self): + self.op_type = "p_norm" + self.dtype = self.in_type + self.shape = [2, 3, 4, 5] + self.epsilon = 1e-12 + self.axis = 1 + self.porder = 2.0 + self.asvector = False + self.keepdims = False + self.set_attrs() + np.random.seed(12345) + + x_np = np.random.uniform(-10, 10, self.shape).astype(self.dtype) + + ref_y_np = ref_p_norm(x_np, self.axis, self.porder, self.keepdims, + self.asvector) + self.inputs = {'X': x_np} + self.outputs = {'Out': ref_y_np} + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'porder': float(self.porder), + 'asvector': self.asvector + } + + def set_attrs(self): + pass + + def test_check_output(self): + 
self.check_output_with_place(paddle.XPUPlace(0), atol=1e-4) + + def test_check_grad(self): + self.check_grad_with_place(paddle.XPUPlace(0), ['X'], 'Out') + + class TestPnormOp2(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = 2.0 + + class TestPnormOp3(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = np.inf + + class TestPnormOp4(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = -np.inf + + class TestPnormOp5(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = 0 + + class TestPnormOp6(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = -1 + self.porder = 2 + + class TestPnormOp7(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3, 10] + self.axis = 2 + self.porder = 2.0 + + class TestPnormOp8(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.porder = np.inf + + class TestPnormOp9(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3, 10] + self.axis = 1 + self.porder = -np.inf + + class TestPnormOp10(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3, 10] + self.axis = 2 + self.porder = 0 + + class TestPnormOp11(TestXPUPNormOp): + + def set_attrs(self): + self.shape = [3, 20, 3, 10] + self.axis = -1 + self.porder = 2 + + +support_types = get_xpu_op_support_types('p_norm') +for stype in support_types: + create_test_class(globals(), XPUTestPNormOp, stype) + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py index 5ab62af7104e9..0d7121144adab 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py @@ -297,6 +297,7 @@ def setUp(self): 'exclusive': self.exclusive, 'adaptive': self.adaptive, "padding_algorithm": self.padding_algorithm, + 'ceil_mode': self.ceil_mode } self.outputs = {'Out': output} @@ -469,6 +470,77 @@ def init_test_case(self): def init_shape(self): self.shape = [2, 3, 7, 7] + class TestCaseCeil1(TestPool2D_Op): + + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + + def init_paddings(self): + self.paddings = [0, 0] + + def init_pool_type(self): + self.pool_type = "avg" + self.pool2D_forward_naive = avg_pool2D_forward_naive + + def init_global_pool(self): + self.global_pool = False + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + def init_ceil_mode(self): + self.ceil_mode = True + + class TestCaseCeil2(TestPool2D_Op): + + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + + def init_paddings(self): + self.paddings = [1, 1] + + def init_pool_type(self): + self.pool_type = "avg" + self.pool2D_forward_naive = avg_pool2D_forward_naive + + def init_global_pool(self): + self.global_pool = False + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + def init_ceil_mode(self): + self.ceil_mode = True + + class TestCaseCeil3(TestPool2D_Op): + + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + def init_ceil_mode(self): + self.ceil_mode = True + + class TestCaseCeil4(TestCaseCeil1): + + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + def init_ceil_mode(self): + self.ceil_mode = True + + 
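# Editor's note (illustrative sketch, not part of the generated tests): the
# ceil_mode cases in this file exercise the XPU pool2d change earlier in this
# patch, where the kernel emulates ceil_mode by adding (stride - 1) to the
# bottom/right paddings before calling the XPU pooling primitive. The helper
# below (a hypothetical name, used only here) shows why that is equivalent.
def _pool_out_len(in_len, k, stride, pad_head, pad_tail, ceil_mode=False):
    total = in_len + pad_head + pad_tail - k
    # ceil_mode rounds the output length up instead of down.
    return (total + stride - 1) // stride + 1 if ceil_mode else total // stride + 1

# in_len=6, k=3, stride=2, no padding: floor rounding gives 2, ceil gives 3 ...
assert _pool_out_len(6, 3, 2, 0, 0, ceil_mode=False) == 2
assert _pool_out_len(6, 3, 2, 0, 0, ceil_mode=True) == 3
# ... and growing the tail padding by (stride - 1) reproduces the ceil result
# while keeping floor rounding, which is what the XPU kernel change does.
assert _pool_out_len(6, 3, 2, 0, 0 + (2 - 1), ceil_mode=False) == 3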
class TestCaseCeil5(TestCaseCeil2): + + def init_pool_type(self): + self.pool_type = "max" + self.pool2D_forward_naive = max_pool2D_forward_naive + + def init_ceil_mode(self): + self.ceil_mode = True + support_types = get_xpu_op_support_types('pool2d') for stype in support_types: From d15b490ad6ce251a1c3ef1386f73e7da824a807c Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 14 Jul 2022 10:43:44 +0800 Subject: [PATCH 200/250] [operator migration] Migrate merged momentum cpu/gpu kernels (#44300) --- .../optimizers/merged_momentum_op.cc | 6 +- .../operators/optimizers/merged_momentum_op.h | 370 ---------------- .../optimizers/merged_momentum_op_mlu.cc | 8 +- .../optimizers/merged_momentum_op_npu.cc | 7 +- .../pow2_decay_with_linear_warmup_op.h | 2 +- paddle/fluid/platform/macros.h | 6 - paddle/phi/core/macros.h | 6 + .../kernels/cpu/merged_momentum_kernel.cc} | 20 +- .../phi/kernels/gpu/merged_momentum_kernel.cu | 25 ++ .../phi/kernels/impl/merged_momentum_impl.h | 400 ++++++++++++++++++ paddle/phi/kernels/merged_momentum_kernel.h | 42 ++ paddle/phi/ops/compat/merged_momentum_sig.cc | 40 ++ 12 files changed, 538 insertions(+), 394 deletions(-) delete mode 100644 paddle/fluid/operators/optimizers/merged_momentum_op.h rename paddle/{fluid/operators/optimizers/merged_momentum_op.cu => phi/kernels/cpu/merged_momentum_kernel.cc} (55%) create mode 100644 paddle/phi/kernels/gpu/merged_momentum_kernel.cu create mode 100644 paddle/phi/kernels/impl/merged_momentum_impl.h create mode 100644 paddle/phi/kernels/merged_momentum_kernel.h create mode 100644 paddle/phi/ops/compat/merged_momentum_sig.cc diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc index e6aec5cec9e66..220c0be9ddf0f 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -103,7 +103,3 @@ namespace plat = paddle::platform; REGISTER_OP_WITHOUT_GRADIENT(merged_momentum, ops::MergedMomentumOp, ops::MergedMomentumOpMaker); - -REGISTER_OP_CPU_KERNEL(merged_momentum, - ops::MergedMomentumOpKernel, - ops::MergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.h b/paddle/fluid/operators/optimizers/merged_momentum_op.h deleted file mode 100644 index 77c8f3dbd3555..0000000000000 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.h +++ /dev/null @@ -1,370 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
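// Editor's note (hedged sketch, not part of the patch): the header deleted
// below held the fused multi-parameter momentum functors that this commit
// migrates into phi (see paddle/phi/kernels/impl/merged_momentum_impl.h later
// in the diff). The per-element update it applies to every parameter is, in
// plain C++ (momentum_step is an illustrative name, not a Paddle API):
#include <cstddef>
template <typename T>
void momentum_step(T* param, const T* grad, T* velocity,
                   T lr, T mu, T rescale_grad, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    velocity[i] = mu * velocity[i] + grad[i] * rescale_grad;  // velocity_out
    param[i] -= lr * velocity[i];                             // param_out
  }
}
// The real functor also keeps an optional float32 "master" copy of float16
// parameters; that part is omitted from this sketch.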
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" - -namespace paddle { -namespace operators { - -template -using MultiPrecisionType = typename details::MPTypeTrait::Type; - -template -struct MergedMomentumMasterParams { - MT *PADDLE_RESTRICT master_params[kParamNum]; - - HOSTDEVICE MT *MasterParam(size_t idx) const { return master_params[idx]; } - HOSTDEVICE void SetMasterParam(size_t idx, MT *p) { master_params[idx] = p; } -}; - -template -struct MergedMomentumMasterParams { - HOSTDEVICE constexpr MT *MasterParam(size_t) const { return nullptr; } - HOSTDEVICE constexpr void SetMasterParam(size_t, MT *) {} -}; - -template -struct MergedMomentumKernelParam - : public MergedMomentumMasterParams { - static constexpr auto N = kParamNum; - size_t sizes[N]; - T *PADDLE_RESTRICT params[N]; - const T *PADDLE_RESTRICT grads[N]; - MT *PADDLE_RESTRICT velocitys[N]; - const MultiPrecisionType *PADDLE_RESTRICT lr; - MT mu; - MT rescale_grad; - uint32_t param_num; - - HOSTDEVICE void operator()(size_t i) const { - const MT lr_val = static_cast(*lr); - for (uint32_t idx = 0; idx < param_num; ++idx) { - auto size = sizes[idx]; - if (i >= size) continue; - - auto param_p = params[idx]; - auto grad_p = grads[idx]; - auto velocity_p = velocitys[idx]; - auto master_param_p = this->MasterParam(idx); - - const MT param = - master_param_p ? master_param_p[i] : static_cast(param_p[i]); - const MT grad = static_cast(grad_p[i]) * rescale_grad; - const MT velocity = velocity_p[i]; - const MT velocity_out = velocity * mu + grad; - const MT param_out = param - lr_val * velocity_out; - velocity_p[i] = velocity_out; - param_p[i] = static_cast(param_out); - if (master_param_p) { - master_param_p[i] = param_out; - } - } - } -}; - -template -class MergedMomentumOpKernel : public framework::OpKernel { - using MPType = typename operators::details::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const bool multi_precision = ctx.Attr("multi_precision"); - if (multi_precision) { - InnerCompute(ctx, multi_precision); - } else { - InnerCompute(ctx, multi_precision); - } - } - - private: - template - void InnerCompute(const framework::ExecutionContext &ctx, - const bool multi_precision) const { - auto params = ctx.MultiInput("Param"); - auto params_out = ctx.MultiOutput("ParamOut"); - size_t n = params.size(); - PADDLE_ENFORCE_EQ(n, - params_out.size(), - platform::errors::InvalidArgument( - "The size of Output(ParamOut) must be equal to " - "Input(Param), but got the size of Output(ParamOut) " - "is %d, the size of Input(Param) is %d.", - params_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(params[i], - params_out[i], - platform::errors::InvalidArgument( - "The size of Input(Param) and Output(ParamOut) " - "must be the same Tensors.")); - } - - auto grads = ctx.MultiInput("Grad"); - PADDLE_ENFORCE_EQ( - n, - grads.size(), - platform::errors::InvalidArgument( - "The size of Input(Grad) must be equal to Input(Param), but got " - "the size of Input(Grad) is %d, the size of Input(Param) is %d.", - grads.size(), - n)); - - auto velocitys = ctx.MultiInput("Velocity"); - PADDLE_ENFORCE_EQ(n, - velocitys.size(), - 
platform::errors::InvalidArgument( - "The size of Input(Velocity) must be equal to " - "Input(Param), but got the size of Input(Velocity) " - "is %d, the size of Input(Param) is %d.", - velocitys.size(), - n)); - - auto velocitys_out = ctx.MultiOutput("VelocityOut"); - PADDLE_ENFORCE_EQ( - n, - velocitys_out.size(), - platform::errors::InvalidArgument( - "The size of Output(VelocityOut) must be " - "equal to Input(Param), but got the size of Output(VelocityOut) is " - "%d, the size of Input(Param) is %d.", - velocitys_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(velocitys[i], - velocitys_out[i], - platform::errors::InvalidArgument( - "Input(Velocity) and Output(VelocityOut) must be " - "the same Tensors.")); - } - - auto master_params = ctx.MultiInput("MasterParam"); - auto master_params_out = - ctx.MultiOutput("MasterParamOut"); - if (multi_precision) { - PADDLE_ENFORCE_EQ( - n, - master_params.size(), - platform::errors::InvalidArgument( - "The size of Input(MasterParam) must be " - "equal to Input(Param), but got the size of Input(MasterParam) " - "is %d, the size of Input(Param) is %d.", - master_params.size(), - n)); - PADDLE_ENFORCE_EQ( - n, - master_params_out.size(), - platform::errors::InvalidArgument( - "The size of Output(MasterParamOut) must be equal to " - "Input(MasterParam), but got the size of Output(MasterParamOut) " - "is %d, the size of Input(Param) is %d.", - master_params_out.size(), - n)); - for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_EQ(master_params[i], - master_params_out[i], - platform::errors::InvalidArgument( - "Input(MasterParam) and Output(MasterParamOut) " - "must be the same Tensors.")); - PADDLE_ENFORCE_NOT_NULL(master_params[i], - platform::errors::InvalidArgument( - "Input(MasterParam) must be provided when " - "multi_precision=True.")); - } - } else { - master_params.clear(); - master_params_out.clear(); - } - - auto mu = ctx.Attr("mu"); - auto rescale_grad = ctx.Attr("rescale_grad"); - auto lrs = ctx.MultiInput("LearningRate"); - if (lrs.size() != 1) { - PADDLE_ENFORCE_EQ( - n, - lrs.size(), - platform::errors::InvalidArgument( - "If the size of Input(LearningRate) is not 1, the size of " - "Input(LearningRate) must be " - "equal to Input(Param), but got the size of Input(LearningRate) " - "is %d, the size of Input(Param) is %d.", - lrs.size(), - n)); - } - auto use_nesterov = ctx.Attr("use_nesterov"); - auto regularization_methods = - ctx.Attr>("regularization_method"); - auto regularization_coeffs = - ctx.Attr>("regularization_coeff"); - if (regularization_methods.size() != 0) { - PADDLE_ENFORCE_EQ( - n, - regularization_methods.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_method) must be equal " - "to Input(Param), but got the size of " - "Attr(regularization_method) is %d, the size of Input(Param) is " - "%d.", - regularization_methods.size(), - n)); - PADDLE_ENFORCE_EQ( - n, - regularization_coeffs.size(), - platform::errors::InvalidArgument( - "The size of Attr(regularization_coeff) must be equal " - "to Input(Param), but got the size of Attr(regularization_coeff) " - "is %d, the size of Input(Param) is %d.", - regularization_coeffs.size(), - n)); - } - - VLOG(5) << "use_nesterov: " << use_nesterov - << ", regularization_methods.size(): " - << regularization_methods.size() - << ", regularization_coeffs.size(): " - << regularization_coeffs.size(); - - auto &dev_ctx = ctx.template device_context(); - - if (lrs.size() == 1 && use_nesterov == false && - regularization_methods.size() 
== 0) { -#define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision) \ - MergedMomentumKernelParam kernel_params; \ - constexpr auto kMaxMergedNum = decltype(kernel_params)::N; \ - size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum; \ - kernel_params.mu = static_cast(mu); \ - kernel_params.rescale_grad = static_cast(rescale_grad); \ - kernel_params.lr = lrs[0]->data(); \ - for (size_t i = 0; i < kernel_num; ++i) { \ - size_t start = i * kMaxMergedNum; \ - size_t end = std::min((i + 1) * kMaxMergedNum, n); \ - kernel_params.param_num = static_cast(end - start); \ - size_t max_size = 0; \ - for (size_t j = 0; j < kernel_params.param_num; ++j) { \ - auto size = static_cast(params_out[j + start]->numel()); \ - max_size = std::max(max_size, size); \ - kernel_params.sizes[j] = size; \ - kernel_params.params[j] = params_out[j + start]->data(); \ - kernel_params.grads[j] = grads[j + start]->data(); \ - kernel_params.velocitys[j] = velocitys_out[j + start]->data(); \ - kernel_params.SetMasterParam( \ - j, \ - kMultiPrecision ? master_params_out[j + start]->data() \ - : nullptr); \ - } \ - platform::ForRange for_range(dev_ctx, max_size); \ - for_range(kernel_params); \ - VLOG(10) << "Launch MergedMomentum kernel " << i << " " \ - << kernel_params.param_num; \ - } - if (multi_precision) { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); - } else { - PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); - } -#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL - } else { - for (size_t idx = 0; idx < n; idx++) { - phi::RegularizationType regularization_flag = - regularization_methods.size() > 0 && - regularization_methods[idx] == "l2_decay" - ? phi::RegularizationType::kL2DECAY - : phi::RegularizationType::kNONE; - - MT regularization_coeff = static_cast(0.0); - if (regularization_coeffs.size() != 0) { - regularization_coeff = static_cast(regularization_coeffs[idx]); - } - auto lr_temp = lrs.size() > 1 ? lrs[idx] : lrs[0]; - - const MT *master_in_data = - multi_precision ? master_params[idx]->data() : nullptr; - MT *master_out_data = - multi_precision ? 
master_params_out[idx]->data() : nullptr; - if (platform::is_cpu_place(ctx.GetPlace())) { - phi::CPUDenseMomentumFunctor functor; - functor(params[idx], - grads[idx], - velocitys[idx], - lr_temp, - static_cast(mu), - use_nesterov, - regularization_flag, - regularization_coeff, - params_out[idx], - velocitys_out[idx]); - VLOG(10) << "Launch MergedMomentum cpu kernel."; - } else if (platform::is_gpu_place(ctx.GetPlace())) { - platform::ForRange for_range( - static_cast(ctx.device_context()), - params[idx]->numel()); -#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type) \ - phi::DenseMomentumFunctor functor( \ - params[idx]->data(), \ - grads[idx]->data(), \ - velocitys[idx]->data(), \ - lr_temp->data(), \ - master_in_data, \ - static_cast(mu), \ - static_cast(rescale_grad), \ - params[idx]->numel(), \ - regularization_coeff, \ - params_out[idx]->data(), \ - velocitys_out[idx]->data(), \ - master_out_data); \ - for_range(functor); - if (use_nesterov) { - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( - phi::UseNesterov, phi::RegularizationType::kL2DECAY); - VLOG(10) - << "Launch MergedMomentum gpu kernel use_nesterov kL2DECAY."; - } else { - PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( - phi::UseNesterov, phi::RegularizationType::kNONE); - VLOG(10) - << "Launch MergedMomentum gpu kernel use_nesterov kNONE."; - } - } else { - if (regularization_flag == phi::RegularizationType::kL2DECAY) { - PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( - phi::NoNesterov, phi::RegularizationType::kL2DECAY); - VLOG(10) - << "Launch MergedMomentum gpu kernel no_nesterov kL2DECAY."; - } else { - PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( - phi::NoNesterov, phi::RegularizationType::kNONE); - VLOG(10) << "Launch MergedMomentum gpu kernel no_nesterov kNONE."; - } - } - } - } - VLOG(10) - << "Launch MergedMomentum kernel with multi_lr and regularization."; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc index 32af057ecd417..90faf8f389a89 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -12,8 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" -#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc index ff131138e8a6f..38479d6dba22e 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_npu.cc @@ -12,8 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/macros.h" #include "paddle/phi/kernels/impl/momentum_kernel_impl.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h index 60274f6b667da..d3d2e48fdcd6c 100644 --- a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/macros.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/macros.h b/paddle/fluid/platform/macros.h index 9eede99b7b733..2ea58a7bb0c81 100644 --- a/paddle/fluid/platform/macros.h +++ b/paddle/fluid/platform/macros.h @@ -29,9 +29,3 @@ limitations under the License. */ #define FLT_MAX __FLT_MAX__ #endif // __FLT_MAX__ #endif // PADDLE_WITH_MUSL - -#if defined(__NVCC__) || defined(__HIPCC__) -#define PADDLE_RESTRICT __restrict__ -#else -#define PADDLE_RESTRICT -#endif diff --git a/paddle/phi/core/macros.h b/paddle/phi/core/macros.h index 8049d027a77b8..e48f7342e456e 100644 --- a/paddle/phi/core/macros.h +++ b/paddle/phi/core/macros.h @@ -53,4 +53,10 @@ namespace phi { #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 #define PD_EXPAND(x) x +#if defined(__NVCC__) || defined(__HIPCC__) +#define PADDLE_RESTRICT __restrict__ +#else +#define PADDLE_RESTRICT +#endif + } // namespace phi diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cu b/paddle/phi/kernels/cpu/merged_momentum_kernel.cc similarity index 55% rename from paddle/fluid/operators/optimizers/merged_momentum_op.cu rename to paddle/phi/kernels/cpu/merged_momentum_kernel.cc index 7e4bbd9807938..0751711ef64fe 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cu +++ b/paddle/phi/kernels/cpu/merged_momentum_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/merged_momentum_impl.h" -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - merged_momentum, - ops::MergedMomentumOpKernel, - ops::MergedMomentumOpKernel, - ops::MergedMomentumOpKernel); +PD_REGISTER_KERNEL(merged_momentum, + CPU, + ALL_LAYOUT, + phi::MergedMomentumKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/merged_momentum_kernel.cu b/paddle/phi/kernels/gpu/merged_momentum_kernel.cu new file mode 100644 index 0000000000000..c6883caecd1a6 --- /dev/null +++ b/paddle/phi/kernels/gpu/merged_momentum_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/merged_momentum_impl.h" + +PD_REGISTER_KERNEL(merged_momentum, + GPU, + ALL_LAYOUT, + phi::MergedMomentumKernel, + phi::dtype::float16, + float, + double) {} diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h new file mode 100644 index 0000000000000..2972a93d10858 --- /dev/null +++ b/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -0,0 +1,400 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/macros.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/impl/momentum_kernel_impl.h" +#include "paddle/phi/kernels/merged_momentum_kernel.h" + +namespace phi { + +template +using MultiPrecisionType = typename phi::dtype::MPTypeTrait::Type; + +template +struct MergedMomentumMasterParams { + MT *PADDLE_RESTRICT master_params[kParamNum]; + + HOSTDEVICE MT *MasterParam(size_t idx) const { return master_params[idx]; } + HOSTDEVICE void SetMasterParam(size_t idx, MT *p) { master_params[idx] = p; } +}; + +template +struct MergedMomentumMasterParams { + HOSTDEVICE constexpr MT *MasterParam(size_t) const { return nullptr; } + HOSTDEVICE constexpr void SetMasterParam(size_t, MT *) {} +}; + +template +struct MergedMomentumKernelParam + : public MergedMomentumMasterParams { + static constexpr auto N = kParamNum; + size_t sizes[N]; + T *PADDLE_RESTRICT params[N]; + const T *PADDLE_RESTRICT grads[N]; + MT *PADDLE_RESTRICT velocitys[N]; + const MultiPrecisionType *PADDLE_RESTRICT lr; + MT mu; + MT rescale_grad; + uint32_t param_num; + + HOSTDEVICE void operator()(size_t i) const { + const MT lr_val = static_cast(*lr); + for (uint32_t idx = 0; idx < param_num; ++idx) { + auto size = sizes[idx]; + if (i >= size) continue; + + auto param_p = params[idx]; + auto grad_p = grads[idx]; + auto velocity_p = velocitys[idx]; + auto master_param_p = this->MasterParam(idx); + + const MT param = + master_param_p ? master_param_p[i] : static_cast(param_p[i]); + const MT grad = static_cast(grad_p[i]) * rescale_grad; + const MT velocity = velocity_p[i]; + const MT velocity_out = velocity * mu + grad; + const MT param_out = param - lr_val * velocity_out; + velocity_p[i] = velocity_out; + param_p[i] = static_cast(param_out); + if (master_param_p) { + master_param_p[i] = param_out; + } + } + } +}; + +template +void MergedMomentumInnerCompute( + const Context &ctx, + const std::vector ¶ms, + const std::vector &grads, + const std::vector &velocitys, + const std::vector &lrs, + const paddle::optional> &master_params_opt, + float mu, + bool use_nesterov, + const std::vector ®ularization_methods, + const std::vector ®ularization_coeffs, + float rescale_grad, + const bool multi_precision, + std::vector params_out, + std::vector velocitys_out, + std::vector master_params_out) { + size_t n = params.size(); + PADDLE_ENFORCE_EQ(n, + params_out.size(), + phi::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), + n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ( + params[i], + params_out[i], + phi::errors::InvalidArgument("Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); + } + + PADDLE_ENFORCE_EQ( + n, + grads.size(), + phi::errors::InvalidArgument( + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grads.size(), + n)); + + PADDLE_ENFORCE_EQ(n, + velocitys.size(), + phi::errors::InvalidArgument( + "The size of Input(Velocity) must be equal to " + "Input(Param), but got the size of Input(Velocity) " + "is %d, the size of Input(Param) is %d.", + velocitys.size(), + n)); + + PADDLE_ENFORCE_EQ( + n, + velocitys_out.size(), + phi::errors::InvalidArgument( + "The size of 
Output(VelocityOut) must be " + "equal to Input(Param), but got the size of Output(VelocityOut) is " + "%d, the size of Input(Param) is %d.", + velocitys_out.size(), + n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(velocitys[i], + velocitys_out[i], + phi::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); + } + + if (multi_precision) { + auto master_params = master_params_opt.get(); + PADDLE_ENFORCE_EQ( + n, + master_params.size(), + phi::errors::InvalidArgument( + "The size of Input(MasterParam) must be " + "equal to Input(Param), but got the size of Input(MasterParam) " + "is %d, the size of Input(Param) is %d.", + master_params.size(), + n)); + PADDLE_ENFORCE_EQ( + n, + master_params_out.size(), + phi::errors::InvalidArgument( + "The size of Output(MasterParamOut) must be equal to " + "Input(MasterParam), but got the size of Output(MasterParamOut) " + "is %d, the size of Input(Param) is %d.", + master_params_out.size(), + n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(master_params[i], + master_params_out[i], + phi::errors::InvalidArgument( + "Input(MasterParam) and Output(MasterParamOut) " + "must be the same Tensors.")); + PADDLE_ENFORCE_NOT_NULL(master_params[i], + phi::errors::InvalidArgument( + "Input(MasterParam) must be provided when " + "multi_precision=True.")); + } + } else { + master_params_out.clear(); + } + + if (lrs.size() != 1) { + PADDLE_ENFORCE_EQ( + n, + lrs.size(), + phi::errors::InvalidArgument( + "If the size of Input(LearningRate) is not 1, the size of " + "Input(LearningRate) must be " + "equal to Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lrs.size(), + n)); + } + if (regularization_methods.size() != 0) { + PADDLE_ENFORCE_EQ( + n, + regularization_methods.size(), + phi::errors::InvalidArgument( + "The size of Attr(regularization_method) must be equal " + "to Input(Param), but got the size of " + "Attr(regularization_method) is %d, the size of Input(Param) is " + "%d.", + regularization_methods.size(), + n)); + PADDLE_ENFORCE_EQ( + n, + regularization_coeffs.size(), + phi::errors::InvalidArgument( + "The size of Attr(regularization_coeff) must be equal " + "to Input(Param), but got the size of Attr(regularization_coeff) " + "is %d, the size of Input(Param) is %d.", + regularization_coeffs.size(), + n)); + } + + VLOG(5) << "use_nesterov: " << use_nesterov + << ", regularization_methods.size(): " + << regularization_methods.size() + << ", regularization_coeffs.size(): " + << regularization_coeffs.size(); + + if (lrs.size() == 1 && use_nesterov == false && + regularization_methods.size() == 0) { +#define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision) \ + MergedMomentumKernelParam kernel_params; \ + constexpr auto kMaxMergedNum = decltype(kernel_params)::N; \ + size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum; \ + kernel_params.mu = static_cast(mu); \ + kernel_params.rescale_grad = static_cast(rescale_grad); \ + kernel_params.lr = lrs[0]->data(); \ + for (size_t i = 0; i < kernel_num; ++i) { \ + size_t start = i * kMaxMergedNum; \ + size_t end = std::min((i + 1) * kMaxMergedNum, n); \ + kernel_params.param_num = static_cast(end - start); \ + size_t max_size = 0; \ + for (size_t j = 0; j < kernel_params.param_num; ++j) { \ + auto size = static_cast(params_out[j + start]->numel()); \ + max_size = std::max(max_size, size); \ + kernel_params.sizes[j] = size; \ + kernel_params.params[j] = params_out[j + start]->data(); \ 
+ kernel_params.grads[j] = grads[j + start]->data(); \ + kernel_params.velocitys[j] = velocitys_out[j + start]->data(); \ + kernel_params.SetMasterParam( \ + j, \ + kMultiPrecision ? master_params_out[j + start]->data() \ + : nullptr); \ + } \ + phi::funcs::ForRange for_range(ctx, max_size); \ + for_range(kernel_params); \ + VLOG(10) << "Launch MergedMomentum kernel " << i << " " \ + << kernel_params.param_num; \ + } + if (multi_precision) { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + } else { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); + } +#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL + } else { + for (size_t idx = 0; idx < n; idx++) { + phi::RegularizationType regularization_flag = + regularization_methods.size() > 0 && + regularization_methods[idx] == "l2_decay" + ? phi::RegularizationType::kL2DECAY + : phi::RegularizationType::kNONE; + + MT regularization_coeff = static_cast(0.0); + if (regularization_coeffs.size() != 0) { + regularization_coeff = static_cast(regularization_coeffs[idx]); + } + auto lr_temp = lrs.size() > 1 ? lrs[idx] : lrs[0]; + + const MT *master_in_data = + multi_precision ? master_params_opt.get()[idx]->data() : nullptr; + MT *master_out_data = + multi_precision ? master_params_out[idx]->data() : nullptr; + if (paddle::platform::is_cpu_place(ctx.GetPlace())) { + phi::CPUDenseMomentumFunctor functor; + functor(params[idx], + grads[idx], + velocitys[idx], + lr_temp, + static_cast(mu), + use_nesterov, + regularization_flag, + regularization_coeff, + params_out[idx], + velocitys_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; + } else if (paddle::platform::is_gpu_place(ctx.GetPlace())) { + phi::funcs::ForRange for_range( + static_cast(ctx), params[idx]->numel()); +#define PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL(__nesterov, __reg_type) \ + phi::DenseMomentumFunctor functor( \ + params[idx]->data(), \ + grads[idx]->data(), \ + velocitys[idx]->data(), \ + lr_temp->data(), \ + master_in_data, \ + static_cast(mu), \ + static_cast(rescale_grad), \ + params[idx]->numel(), \ + regularization_coeff, \ + params_out[idx]->data(), \ + velocitys_out[idx]->data(), \ + master_out_data); \ + for_range(functor); + if (use_nesterov) { + if (regularization_flag == phi::RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + phi::UseNesterov, phi::RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel use_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + phi::UseNesterov, phi::RegularizationType::kNONE); + VLOG(10) << "Launch MergedMomentum gpu kernel use_nesterov kNONE."; + } + } else { + if (regularization_flag == phi::RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + phi::NoNesterov, phi::RegularizationType::kL2DECAY); + VLOG(10) + << "Launch MergedMomentum gpu kernel no_nesterov kL2DECAY."; + } else { + PADDLE_LAUNCH_DENSE_MTMOMENTUM_KERNEL( + phi::NoNesterov, phi::RegularizationType::kNONE); + VLOG(10) << "Launch MergedMomentum gpu kernel no_nesterov kNONE."; + } + } + } + } + VLOG(10) + << "Launch MergedMomentum kernel with multi_lr and regularization."; + } +} + +template +void MergedMomentumKernel( + const Context &dev_ctx, + const std::vector ¶m, + const std::vector &grad, + const std::vector &velocity, + const std::vector &learning_rate, + const paddle::optional> &master_param, + float mu, + bool use_nesterov, + const std::vector ®ularization_method, + const std::vector ®ularization_coeff, + bool multi_precision, + float rescale_grad, + std::vector param_out, + 
std::vector velocity_out, + std::vector master_param_out) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + if (multi_precision) { + MergedMomentumInnerCompute( + dev_ctx, + param, + grad, + velocity, + learning_rate, + master_param, + mu, + use_nesterov, + regularization_method, + regularization_coeff, + rescale_grad, + multi_precision, + param_out, + velocity_out, + master_param_out); + } else { + MergedMomentumInnerCompute(dev_ctx, + param, + grad, + velocity, + learning_rate, + master_param, + mu, + use_nesterov, + regularization_method, + regularization_coeff, + rescale_grad, + multi_precision, + param_out, + velocity_out, + master_param_out); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/merged_momentum_kernel.h b/paddle/phi/kernels/merged_momentum_kernel.h new file mode 100644 index 0000000000000..9f21b988b4bed --- /dev/null +++ b/paddle/phi/kernels/merged_momentum_kernel.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MergedMomentumKernel( + const Context& dev_ctx, + const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, + float mu, + bool use_nesterov, + const std::vector& regularization_method, + const std::vector& regularization_coeff, + bool multi_precision, + float rescale_grad, + std::vector param_out, + std::vector velocity_out, + std::vector master_param_out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/merged_momentum_sig.cc b/paddle/phi/ops/compat/merged_momentum_sig.cc new file mode 100644 index 0000000000000..3444d5e2d3097 --- /dev/null +++ b/paddle/phi/ops/compat/merged_momentum_sig.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
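// Editor's note (hedged sketch, not part of the patch): the compat "sig" file
// below maps the legacy fluid op description (named inputs, attributes and
// outputs) onto the new phi kernel so the old operator keeps working. A
// simplified stand-alone model of that mapping; SignatureSketch is an
// illustrative type, not Paddle's real KernelSignature:
#include <string>
#include <vector>
struct SignatureSketch {
  std::string kernel_name;
  std::vector<std::string> inputs;
  std::vector<std::string> attrs;
  std::vector<std::string> outputs;
};
inline SignatureSketch MergedMomentumSignatureSketch() {
  return {"merged_momentum",
          {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"},
          {"mu", "use_nesterov", "regularization_method",
           "regularization_coeff", "multi_precision", "rescale_grad"},
          {"ParamOut", "VelocityOut", "MasterParamOut"}};
}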
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MergedMomentumOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "merged_momentum", + {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, + {"mu", + "use_nesterov", + "regularization_method", + "regularization_coeff", + "multi_precision", + "rescale_grad"}, + { + "ParamOut", + "VelocityOut", + "MasterParamOut", + }); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(merged_momentum, + phi::MergedMomentumOpArgumentMapping); From e8d78a70007f43eb361a8a23a0961bdf4674a634 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Thu, 14 Jul 2022 10:43:55 +0800 Subject: [PATCH 201/250] [AMP] Add amp logic in python_C (#44309) * add amp logic in python_C * fix inplace bug --- .../final_state_generator/python_c_gen.py | 178 ++++++++++++++++-- paddle/fluid/eager/eager_amp_auto_cast.h | 27 ++- 2 files changed, 185 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index d1e7885bae4c1..c6ac5a12f56d3 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -50,6 +50,45 @@ def SkipAPIGeneration(forward_api_name): "paddle::experimental::DataType": "CastPyArg2DataType", } +# This list contains ops that do not need to generate amp logic +# All optimizer ops in this list +no_amp_list = [ + 'adam_', + 'adam', + 'adamw_', + 'adamw', + 'decayed_adagrad_', + 'decayed_adagrad', + 'dgc_momentum_', + 'dgc_momentum', + 'distributed_fused_lamb_', + 'distributed_fused_lamb', + 'dpsgd_', + 'dpsgd', + 'ftrl_', + 'ftrl', + 'lamb_', + 'lamb', + 'lars_momentum_', + 'lars_momentum', + 'merged_adam_', + 'merged_adam', + 'merged_momentum_', + 'merged_momentum', + 'momentum_', + 'momentum', + 'proximal_adagrad_', + 'proximal_adagrad', + 'proximal_gd_', + 'proximal_gd', + 'rmsprop_', + 'rmsprop', + 'sgd_', + 'sgd', + 'sparse_momentum_', + 'sparse_momentum', +] + def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): @@ -99,7 +138,7 @@ def FindParsingFunctionFromAttributeType(atype): // Set Device ID {} // Call dygraph function - decltype({}({})) out = {}({}); + {} PyEval_RestoreThread(tstate); tstate = nullptr; @@ -114,6 +153,25 @@ def FindParsingFunctionFromAttributeType(atype): }} """ +NOAMP_DYGRAPH_FUNCTION_TEMPLATE = "decltype({}({})) out = {}({});\n" + +AMP_DYGRAPH_FUNCTION_TEMPLATE = \ +""" + decltype({}({})) out; + // AMP Logic + if (egr::Controller::Instance().GetAMPLevel() != paddle::imperative::AmpLevel::O0) {{ + VLOG(5) << "Check and Prepare For AMP"; + {} + paddle::small_vector, egr::kSlotSmallVectorSize> amp_tensors_vector = {}; + {} + {} + {} + out = {}({}); + }} else {{ + out = {}({}); + }} +""" + FUNCTION_SET_DEVICE_TEMPLATE = \ """{} if (paddle::platform::is_gpu_place(place)) {{ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -149,6 +207,8 @@ def FindParsingFunctionFromAttributeType(atype): #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/eager_final_state_custom_python_api.h" #include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/eager_amp_auto_cast.h" namespace paddle {{ namespace pybind {{ 
@@ -335,11 +395,15 @@ def GeneratePythonCFunction(self): num_args = len( forward_inputs_position_map.keys()) + len(orig_forward_attrs_list) dygraph_function_call_list = ["" for i in range(num_args)] + amp_dygraph_function_call_list = ["" for i in range(num_args)] for name, (_, pos) in forward_inputs_position_map.items(): dygraph_function_call_list[pos] = f"{name}" + amp_dygraph_function_call_list[pos] = f"NEW_{name}" for name, _, _, pos in orig_forward_attrs_list: dygraph_function_call_list[pos] = f"{name}" + amp_dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) + amp_dygraph_function_call_str = ",".join(amp_dygraph_function_call_list) # Generate Python-C Function Definitions if is_forward_only: @@ -355,12 +419,82 @@ def GeneratePythonCFunction(self): pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( "pythonc_record_event", forward_api_name, "pybind_imperative_func") - # Generate Python-C Function Definetion - self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - forward_api_name, pythonc_record_event_str, forward_api_name, - get_eager_tensor_str, parse_attributes_str, set_device_str, + # Forward amp logic + amp_tensors_vector_list = [] + amp_tensors_vector_optional_list = [] + amp_autocast_list = [] + amp_autocast_optional_list = [] + + for name, (ttype, pos) in forward_inputs_position_map.items(): + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + if is_optional: + amp_tensors_vector_optional_list.append( + f"if ({name}.is_initialized()) amp_tensors_vector.push_back({name}.get());\n" + ) + amp_autocast_optional_list.append( + f"auto NEW_{name} = {name}.is_initialized() ? egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name, false) : {name};\n" + ) + else: + amp_tensors_vector_list.append(f"{name}") + amp_autocast_list.append( + f"auto NEW_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name, false);\n" + ) + else: + if is_optional: + amp_tensors_vector_optional_list.append( + f"if ({name}.is_initialized()) amp_tensors_vector.push_back({{{name}.get()}});\n" + ) + amp_autocast_optional_list.append( + f"auto NEW_{name} = {name}.is_initialized() ? 
egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name, false) : {name};\n" + ) + else: + if forward_inplace_map and name in forward_inplace_map.keys( + ): + amp_tensors_vector_list.append(f"{{{name}}}") + amp_autocast_list.append( + f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name, false);\n" + ) + else: + amp_tensors_vector_list.append(f"{{{name}}}") + amp_autocast_list.append( + f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name, false);\n" + ) + amp_tensors_vector_list_str = "{ " + ",".join( + amp_tensors_vector_list) + " }" + amp_tensors_vector_optional_list_str = "".join( + amp_tensors_vector_optional_list) + amp_autocast_list_str = " ".join( + amp_autocast_list) + " " + " ".join( + amp_autocast_optional_list) + + kernel_trans2_op_name_str = f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");" + amp_get_dst_dtype_str = f"auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector);\n" + + noamp_dygraph_function_str = NOAMP_DYGRAPH_FUNCTION_TEMPLATE.format( fwd_function_name, dygraph_function_call_str, fwd_function_name, - dygraph_function_call_str, return_str) + dygraph_function_call_str) + + amp_dygraph_function_str = AMP_DYGRAPH_FUNCTION_TEMPLATE.format( + fwd_function_name, dygraph_function_call_str, + kernel_trans2_op_name_str, amp_tensors_vector_list_str, + amp_tensors_vector_optional_list_str, amp_get_dst_dtype_str, + amp_autocast_list_str, fwd_function_name, + amp_dygraph_function_call_str, fwd_function_name, + dygraph_function_call_str) + + # Generate Python-C Function Definetion + if (is_forward_only) and (len(amp_tensors_vector_list) > + 0) and (forward_api_name not in no_amp_list): + self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( + forward_api_name, pythonc_record_event_str, forward_api_name, + get_eager_tensor_str, parse_attributes_str, set_device_str, + amp_dygraph_function_str, return_str) + else: + self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( + forward_api_name, pythonc_record_event_str, forward_api_name, + get_eager_tensor_str, parse_attributes_str, set_device_str, + noamp_dygraph_function_str, return_str) # Set prefix of forward_api_name to avoid conflicts prefix = self.namespace.strip("::") @@ -383,6 +517,18 @@ def GeneratePythonCFunction(self): "::", namespace, GetForwardFunctionName(inplaced_forward_api_name)) + inplace_noamp_dygraph_function_str = NOAMP_DYGRAPH_FUNCTION_TEMPLATE.format( + inplaced_fwd_function_name, dygraph_function_call_str, + inplaced_fwd_function_name, dygraph_function_call_str) + + inplace_amp_dygraph_function_str = AMP_DYGRAPH_FUNCTION_TEMPLATE.format( + inplaced_fwd_function_name, dygraph_function_call_str, + kernel_trans2_op_name_str, amp_tensors_vector_list_str, + amp_tensors_vector_optional_list_str, amp_get_dst_dtype_str, + amp_autocast_list_str, inplaced_fwd_function_name, + amp_dygraph_function_call_str, inplaced_fwd_function_name, + dygraph_function_call_str) + return_str = " std::map inplace_var_idx_map;" for inplace_input, inplace_output in forward_inplace_map.items(): return_str += RETURN_INPLACE_PYOBJECT_TEMPLATE.format( @@ -391,13 +537,19 @@ def GeneratePythonCFunction(self): return_str += " return ToPyObject(out, args, inplace_var_idx_map);" # Generate Python-C Function Definetion - python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format( - inplaced_forward_api_name, pythonc_record_event_str, - inplaced_forward_api_name, get_eager_tensor_str, - parse_attributes_str, set_device_str, - 
inplaced_fwd_function_name, dygraph_function_call_str, - inplaced_fwd_function_name, dygraph_function_call_str, - return_str) + if (is_forward_only) and (len(amp_tensors_vector_list) > 0) and ( + inplaced_forward_api_name not in no_amp_list): + python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format( + inplaced_forward_api_name, pythonc_record_event_str, + inplaced_forward_api_name, get_eager_tensor_str, + parse_attributes_str, set_device_str, + inplace_amp_dygraph_function_str, return_str) + else: + python_c_inplace_func_str = PYTHON_C_FUNCTION_TEMPLATE.format( + inplaced_forward_api_name, pythonc_record_event_str, + inplaced_forward_api_name, get_eager_tensor_str, + parse_attributes_str, set_device_str, + inplace_noamp_dygraph_function_str, return_str) python_c_inplace_func_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( forward_api_name_prefix, inplaced_forward_api_name, namespace, diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 438ccbaca8a5e..26af2b98ca0ab 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -43,15 +43,21 @@ inline std::vector EagerAmpAutoCasts( const std::string& inputs_name, const std::vector& inputs, const paddle::experimental::DataType& dst_dtype, - std::string op_name) { + std::string op_name, + bool trace_backward = true) { VLOG(6) << "AMP AmpAutoCasts:" << " inputs(" << inputs_name << ") dst_dtype(" << paddle::framework::DataType2String(dst_dtype) << ")."; std::vector inputs_casted; for (auto& input : inputs) { if (NeedCast(input, dst_dtype)) { - inputs_casted.emplace_back( - std::move(cast_final_state_dygraph_function(input, dst_dtype))); + if (trace_backward) { + inputs_casted.emplace_back( + std::move(cast_final_state_dygraph_function(input, dst_dtype))); + } else { + inputs_casted.emplace_back( + std::move(paddle::experimental::cast(input, dst_dtype))); + } } else { inputs_casted.emplace_back(input); } @@ -63,7 +69,8 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( const std::string& input_name, const paddle::experimental::Tensor& input, const paddle::experimental::DataType& dst_dtype, - const std::string& op_name) { + const std::string& op_name, + bool trace_backward = true) { VLOG(6) << "AMP AmpAutoCasts:" << " input(" << input_name << ") dst_dtype(" << paddle::framework::DataType2String(dst_dtype) << ")."; @@ -85,7 +92,11 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( } } if (NeedCast(input, dst_dtype)) { - return cast_final_state_dygraph_function(input, dst_dtype); + if (trace_backward) { + return cast_final_state_dygraph_function(input, dst_dtype); + } else { + return paddle::experimental::cast(input, dst_dtype); + } } return input; } @@ -94,9 +105,11 @@ inline paddle::optional EagerAmpAutoCast( const std::string& input_name, const paddle::optional& input, const paddle::experimental::DataType& dst_dtype, - const std::string& op_name) { + const std::string& op_name, + bool trace_backward = true) { if (input) { - return EagerAmpAutoCast(input_name, *input, dst_dtype, op_name); + return EagerAmpAutoCast( + input_name, *input, dst_dtype, op_name, trace_backward); } return paddle::none; } From f7ecca459ab2ce9dd5b5a82ea41783f652bb2372 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 14 Jul 2022 11:13:19 +0800 Subject: [PATCH 202/250] supoort set original op_name for api (#44317) --- paddle/fluid/operators/diag_v2_op.cc | 117 ------------------ paddle/phi/api/yaml/api.yaml | 9 ++ paddle/phi/api/yaml/api_compat.yaml | 8 ++ 
paddle/phi/api/yaml/backward.yaml | 12 ++ paddle/phi/api/yaml/generator/generate_op.py | 65 ++++++---- .../phi/api/yaml/generator/templates/ks.c.j2 | 5 +- .../generator/templates/operator_utils.c.j2 | 21 ++-- paddle/phi/api/yaml/legacy_api.yaml | 8 -- paddle/phi/kernels/diag_kernel.h | 20 +++ paddle/phi/ops/compat/diag_sig.cc | 34 ----- 10 files changed, 103 insertions(+), 196 deletions(-) delete mode 100644 paddle/fluid/operators/diag_v2_op.cc delete mode 100644 paddle/phi/ops/compat/diag_sig.cc diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc deleted file mode 100644 index 61a3409c418ba..0000000000000 --- a/paddle/fluid/operators/diag_v2_op.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -class DiagV2Op : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input tensor. Its shape is either 1-D or 2-D."); - AddOutput("Out", "The output tensor. A square matrix or a vector."); - AddAttr("offset", - "The diagonal offset. A positive value represents " - "superdiagonal, 0 represents the main diagonal, and a " - "negative value represents subdiagonal.") - .SetDefault(0); - AddAttr("padding_value", - "Use this value to fill the area outside the specified " - "diagonal band. Only takes effect when the input is a 1-D " - "Tensor. The default value is 0.") - .SetDefault(0.0f); - AddComment(R"DOC( - If ``x`` is a vector (1-D tensor), a 2-D square tensor with the elements of ``x`` as the diagonal is returned. - - If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal elements of ``x`` is returned. - - The argument ``offset`` controls the diagonal offset: - - If ``offset`` = 0, it is the main diagonal. - - If ``offset`` > 0, it is superdiagonal. - - If ``offset`` < 0, it is subdiagonal. 
-)DOC"); - } -}; - -class DiagV2GradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "DiagV2Grad"); - - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; - -template -class DiagV2GradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("diag_v2_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X"); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, - DiagInferShapeFunctor, - PD_INFER_META(phi::DiagInferMeta)); - -REGISTER_OPERATOR(diag_v2, - ops::DiagV2Op, - ops::DiagV2OpMaker, - ops::DiagV2GradOpMaker, - ops::DiagV2GradOpMaker, - DiagInferShapeFunctor); - -REGISTER_OPERATOR(diag_v2_grad, - ops::DiagV2GradOp, - ops::DiagGradV2NoNeedBufferVarsInferer); diff --git a/paddle/phi/api/yaml/api.yaml b/paddle/phi/api/yaml/api.yaml index 0f86c93d9314e..b5703aa57f9da 100644 --- a/paddle/phi/api/yaml/api.yaml +++ b/paddle/phi/api/yaml/api.yaml @@ -43,6 +43,15 @@ data_type : x backward : cross_grad +- api : diag + args : (Tensor x, int offset = 0, float padding_value = 0.0) + output : Tensor + infer_meta : + func : DiagInferMeta + kernel : + func : diag + backward : diag_grad + - api : diagonal args : (Tensor x, int offset = 0, int axis1 = 0, int axis2 = 1) output : Tensor diff --git a/paddle/phi/api/yaml/api_compat.yaml b/paddle/phi/api/yaml/api_compat.yaml index 987876d703928..873d735c5df83 100644 --- a/paddle/phi/api/yaml/api_compat.yaml +++ b/paddle/phi/api/yaml/api_compat.yaml @@ -12,6 +12,14 @@ outputs : out : Out +- api : diag + op_name : diag_v2 + grad_op_name : diag_v2_grad + inputs : + x : X + outputs : + out : Out + - api : diagonal inputs : x : Input diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 32906ce382742..17409f8ae7984 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -39,6 +39,18 @@ func : cross_grad data_type : out_grad +- backward_api : diag_grad + forward : diag (Tensor x, int offset, float padding_value) -> Tensor(out) + args : (Tensor x, Tensor out_grad, int offset) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : diag_grad + data_type : out_grad + no_need_buffer : x + - backward_api : diagonal_grad forward : diagonal (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) args : (Tensor x, Tensor out_grad, int offset = 0, int axis1 = 0, int axis2 = 1) diff --git a/paddle/phi/api/yaml/generator/generate_op.py b/paddle/phi/api/yaml/generator/generate_op.py index 
e70042fb9d033..bc8b80efb5156 100644 --- a/paddle/phi/api/yaml/generator/generate_op.py +++ b/paddle/phi/api/yaml/generator/generate_op.py @@ -54,34 +54,21 @@ def restruct_io(api): return api -def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, - api_version_yaml_path, output_op_path, output_arg_map_path): - with open(api_yaml_path, "rt") as f: - apis = yaml.safe_load(f) - apis = [restruct_io(api) for api in apis] - forward_api_dict = to_named_dict(apis) - - with open(backward_yaml_path, "rt") as f: - backward_apis = yaml.safe_load(f) - backward_apis = [restruct_io(api) for api in backward_apis] - backward_api_dict = to_named_dict(backward_apis) - - with open(api_version_yaml_path, "rt") as f: - api_versions = yaml.safe_load(f) - # add api version info into api - for api_version in api_versions: - forward_api_dict[api_version['api']]['version'] = api_version['version'] - - with open(api_compat_yaml_path, "rt") as f: - api_args_map = yaml.safe_load(f) - # replace args name for OpMaker - for api_args in api_args_map: +# replace name of op and params for OpMaker +def replace_compat_name(api_op_map, forward_api_dict, backward_api_dict): + for api_args in api_op_map: if api_args['api'] not in forward_api_dict: continue forward_api_item = forward_api_dict[api_args['api']] has_backward = True if forward_api_item['backward'] else False if has_backward: backward_api_item = backward_api_dict[forward_api_item['backward']] + if 'op_name' in api_args: + forward_api_item['op_name'] = api_args['op_name'] + if 'grad_op_name' in api_args and has_backward: + forward_api_item['backward'] = api_args['grad_op_name'] + backward_api_item['op_name'] = api_args['grad_op_name'] + key_set = ['inputs', 'attrs', 'outputs'] args_map = {} for key in key_set: @@ -175,6 +162,35 @@ def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, for param in backward_api_item['no_need_buffer'] ] + +def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, + api_version_yaml_path, output_op_path, output_arg_map_path): + with open(api_yaml_path, "rt") as f: + apis = yaml.safe_load(f) + apis = [restruct_io(api) for api in apis] + forward_api_dict = to_named_dict(apis) + + with open(backward_yaml_path, "rt") as f: + backward_apis = yaml.safe_load(f) + backward_apis = [restruct_io(api) for api in backward_apis] + backward_api_dict = to_named_dict(backward_apis) + + with open(api_version_yaml_path, "rt") as f: + api_versions = yaml.safe_load(f) + # add api version info into api + for api_version in api_versions: + forward_api_dict[api_version['api']]['version'] = api_version['version'] + + with open(api_compat_yaml_path, "rt") as f: + api_op_map = yaml.safe_load(f) + + for api in apis: + api['op_name'] = api['name'] + for bw_api in backward_apis: + bw_api['op_name'] = bw_api['name'] + + replace_compat_name(api_op_map, forward_api_dict, backward_api_dict) + # fill backward field for an api if another api claims it as forward for name, backward_api in backward_api_dict.items(): forward_name = backward_api["forward"]["name"] @@ -183,11 +199,6 @@ def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, if forward_api["backward"] is None: forward_api["backward"] = name - if forward_name in backward_api_dict: - forward_api = backward_api_dict[forward_name] - if forward_api["backward"] is None: - forward_api["backward"] = name - api_dict = {} api_dict.update(forward_api_dict) api_dict.update(backward_api_dict) diff --git a/paddle/phi/api/yaml/generator/templates/ks.c.j2 
b/paddle/phi/api/yaml/generator/templates/ks.c.j2 index 2855e05b3ca53..0ff6d91fc20ca 100644 --- a/paddle/phi/api/yaml/generator/templates/ks.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/ks.c.j2 @@ -1,4 +1,4 @@ -{% from "operator_utils.c.j2" import name_map, register_name_map %} +{% from "operator_utils.c.j2" import name_map, register_name_map, register_base_kernel_name %} // this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit. #include "paddle/phi/core/compat/op_utils.h" #include "paddle/utils/small_vector.h" @@ -18,6 +18,9 @@ namespace phi { } // namespace phi {% for api in apis + backward_apis %} + {% if api["name"] != api["op_name"] %} +{{register_base_kernel_name(api)}} + {% endif %} {% if api is base_api %} {{register_name_map(api)}} {% endif %} diff --git a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 index 9a593a99c1df0..841de704579d3 100644 --- a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 @@ -1,6 +1,6 @@ {# ----------------------------- op maker ----------------------------------- #} {% macro op_maker(api) %} - {% set api_name = api["name"] %} + {% set api_name = api["op_name"] %} class {{api_name | to_pascal_case}}OpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -124,9 +124,12 @@ All possible KernelSignatures returned by {{api["name"] | to_pascal_case }}OpArg */ {% endmacro %} +{% macro register_base_kernel_name(api) %} +PD_REGISTER_BASE_KERNEL_NAME({{api["op_name"]}}, {{api["name"]}}); +{%- endmacro %} {% macro register_name_map(api) %} -PD_REGISTER_ARG_MAPPING_FN({{api["name"]}}, phi::{{api["name"] | to_pascal_case}}OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN({{api["op_name"]}}, phi::{{api["name"] | to_pascal_case}}OpArgumentMapping); {%- endmacro %} {% macro get_input_list(inputs, kernel_args) %}{# inline #} @@ -196,7 +199,7 @@ framework::OpKernelType GetExpectedKernelType( {# --------------------------------------- operator ---------------------------------------------- #} {% macro operator(api) %} -class {{api["name"] | to_pascal_case}}Op : public framework::OperatorWithKernel { +class {{api["op_name"] | to_pascal_case}}Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; {# ----------- get expected kernel type function -------------------------- #} @@ -209,7 +212,7 @@ class {{api["name"] | to_pascal_case}}Op : public framework::OperatorWithKernel {% endif %} }; -DECLARE_INFER_SHAPE_FUNCTOR({{api["name"]}}, {{api["name"] | to_pascal_case}}InferShapeFunctor, +DECLARE_INFER_SHAPE_FUNCTOR({{api["op_name"]}}, {{api["op_name"] | to_pascal_case}}InferShapeFunctor, PD_INFER_META(phi::{{api["infer_meta"]["func"]}})); {# inplace inferer #} {% if api["inplace"] is not none %} @@ -218,19 +221,19 @@ DECLARE_INFER_SHAPE_FUNCTOR({{api["name"]}}, {{api["name"] | to_pascal_case}}Inf {{"{"}}{{source | to_opmaker_name}}, {{target | to_opmaker_name}}{{"}"}}{{", " if not loop.last}} {%- endfor %} {%- endset %} -DECLARE_INPLACE_OP_INFERER({{api["name"] | to_pascal_case}}InplaceInferer, +DECLARE_INPLACE_OP_INFERER({{api["op_name"] | to_pascal_case}}InplaceInferer, {{inplace_map}}); {% endif %} {# no_need_buffer inferer #} {% if api["no_need_buffer"] is not none %} -DECLARE_NO_NEED_BUFFER_VARS_INFERER({{api["name"] | to_pascal_case}}NoNeedBufferVarInferer, 
+DECLARE_NO_NEED_BUFFER_VARS_INFERER({{api["op_name"] | to_pascal_case}}NoNeedBufferVarInferer, {{api["no_need_buffer"] | map("to_opmaker_name") | join(", ")}}); {% endif %} {% endmacro%} {% macro register_op_with_components(api) %} -{% set name = api["name"] %} +{% set name = api["op_name"] %} REGISTER_OPERATOR({{name}}, ops::{{name | to_pascal_case}}Op, {% if not "forward" in api %}{# it is a forward api #} ops::{{name | to_pascal_case}}OpMaker, @@ -254,7 +257,7 @@ REGISTER_OPERATOR({{name}}, ops::{{name | to_pascal_case}}Op, {% macro register_op_version(api) %} {% if "version" in api %} -{% set name = api["name"] %} +{% set name = api["op_name"] %} REGISTER_OP_VERSION({{name}}) {% for checkpoint in api["version"]%} .AddCheckpoint( @@ -296,7 +299,7 @@ REGISTER_OP_VERSION({{name}}) {# --------------------------------------- backward op maker ---------------------------------------------- #} {% macro backward_op_maker(api, forward_api) %} - {% set name = api["name"] %} + {% set name = api["op_name"] %} {% set forward_input_names = api["forward"]["inputs"] | map(attribute="name") | list %} {% set forward_output_names = api["forward"]["outputs"] | map(attribute="name") | list %} {% set forward_attr_names = api["forward"]["attrs"] | map(attribute="name") | list %} diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index 3dad0b96ae758..ab82ce9473e3c 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -498,14 +498,6 @@ func : determinant backward : det_grad -- api : diag - args : (Tensor x, int offset, float padding_value) - output : Tensor - infer_meta : - func : DiagInferMeta - kernel : - func : diag - - api : divide args : (Tensor x, Tensor y) output : Tensor diff --git a/paddle/phi/kernels/diag_kernel.h b/paddle/phi/kernels/diag_kernel.h index 8dc919fa63360..3168aea54e697 100644 --- a/paddle/phi/kernels/diag_kernel.h +++ b/paddle/phi/kernels/diag_kernel.h @@ -18,6 +18,26 @@ namespace phi { +/** + * @brief If ``x`` is a vector (1-D tensor), a 2-D square tensor with the + * elements of ``x`` as the diagonal is returned. + * If ``x`` is a matrix (2-D tensor), a 1-D tensor with the diagonal + * elements of ``x`` is returned. + * + * The argument ``offset`` controls the diagonal offset: + * If ``offset`` = 0, it is the main diagonal. + * If ``offset`` > 0, it is superdiagonal. If ``offset`` < 0, + * it is subdiagonal. + * @param ctx device context + * @param x The input tensor. Its shape is either 1-D or 2-D. + * @param offset The diagonal offset. A positive value represents + * superdiagonal, 0 represents the main diagonal, and a + * negative value represents subdiagonal. + * @param padding_value Use this value to fill the area outside the specified + * diagonal band. Only takes effect when the input is a + * 1-D Tensor. The default value is 0. + * @param out The output tensor. A square matrix or a vector. + */ template void DiagKernel(const Context& dev_ctx, const DenseTensor& x, diff --git a/paddle/phi/ops/compat/diag_sig.cc b/paddle/phi/ops/compat/diag_sig.cc deleted file mode 100644 index b232c714c9710..0000000000000 --- a/paddle/phi/ops/compat/diag_sig.cc +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature DiagOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"}); -} - -KernelSignature DiagGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "diag_grad", {"X", "Out@GRAD"}, {"offset"}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag); -PD_REGISTER_BASE_KERNEL_NAME(diag_v2_grad, diag_grad); - -PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(diag_v2_grad, phi::DiagGradOpArgumentMapping); From b7287d2b7fb991ff64d1dc93e696775c1abbd78f Mon Sep 17 00:00:00 2001 From: Chen Long <1300851984@qq.com> Date: Thu, 14 Jul 2022 12:29:25 +0800 Subject: [PATCH 203/250] rm TCChenlong from ci approval list;test=document_fix (#44322) --- tools/check_api_approvals.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 815201469e89a..87edff50ef85e 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -43,22 +43,22 @@ api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/flu if [ "$api_spec_diff" != "" -o "${api_params_diff}" != "" ]; then echo_line="You must have one RD (XiaoguangHu01, lanxianghit or Superjomn) approval for API change.\n" echo_line="${echo_line} and one TPM approval for API change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general APIs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general APIs.\n" echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related APIs.\n" echo_line="${echo_line} leiqing1/LeiQing for inference related APIs.\n" check_approval 1 46782768 47554610 328693 - check_approval 1 29231 79295425 23093488 11935832 39876205 65896652 54695910 + check_approval 1 29231 79295425 23093488 39876205 65896652 54695910 fi api_doc_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.doc ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.doc` if [ "$api_doc_spec_diff" != "" ]; then echo_line="You must have one TPM approval for API documents change: \n" - echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, TCChenlong/ChenLong, Ligoml/LiMengLiu for general API docs.\n" + echo_line="${echo_line} jzhang533/ZhangJun, momozi1996/MoYan, dingjiaweiww/DingJiaWei, Ligoml/LiMengLiu for general API docs.\n" echo_line="${echo_line} liuTINA0907/LiuShuangQiao for distributed related API docs.\n" echo_line="${echo_line} leiqing1/LeiQing for inference related API docs.\n" - check_approval 1 29231 79295425 23093488 11935832 39876205 65896652 54695910 + check_approval 1 29231 79295425 23093488 39876205 65896652 54695910 fi api_src_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5 
${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` From c446ab7b75bc265fc16086e001d5cb024e827839 Mon Sep 17 00:00:00 2001 From: zhangyikun02 <48021248+zhangyk0314@users.noreply.github.com> Date: Thu, 14 Jul 2022 13:50:03 +0800 Subject: [PATCH 204/250] bugfix for conv_op_xpu in NHWC data_formate and update xpu.cmake, test=kunlun (#44296) --- cmake/external/xpu.cmake | 4 +- paddle/fluid/operators/conv_op_xpu.cc | 71 +++++++++++++++---- .../tests/unittests/xpu/test_conv2d_op_xpu.py | 35 +++++++-- 3 files changed, 91 insertions(+), 19 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 25d01912f1419..3ead16451a3af 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220708") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220712") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -19,7 +19,7 @@ endif() if(NOT DEFINED XPU_XDNN_BASE_URL) set(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220708") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220712") else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/paddle/fluid/operators/conv_op_xpu.cc b/paddle/fluid/operators/conv_op_xpu.cc index f65921dbc1776..638983ea26be9 100644 --- a/paddle/fluid/operators/conv_op_xpu.cc +++ b/paddle/fluid/operators/conv_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { @@ -71,9 +72,26 @@ class GemmConvXPUKernel : public framework::OpKernel { XPUT *output_data = reinterpret_cast(output->data()); auto &dev_ctx = context.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + + XPUT *filter_data_tmp; + const XPUT *filter_data_ptr = filter_data; + if (data_format == "NHWC") { + filter_data_tmp = RAII_GUARD.alloc(filter.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(filter_data_tmp); + std::vector filter_shape = phi::vectorize(filter.dims()); + int r = xpu::transpose(dev_ctx.x_context(), + filter_data, + filter_data_tmp, + filter_shape, + {0, 2, 3, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + filter_data_ptr = reinterpret_cast(filter_data_tmp); + } + int r = xpu::conv2d(dev_ctx.x_context(), input_data, - filter_data, + filter_data_ptr, output_data, batch_size, img_c, @@ -89,11 +107,7 @@ class GemmConvXPUKernel : public framework::OpKernel { nullptr, nullptr, is_nchw); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU conv kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d"); } }; @@ -134,6 +148,7 @@ class GemmConvGradXPUKernel : public framework::OpKernel { framework::DDim filter_data_dims = phi::slice_ddim(filter.dims(), 2, filter.dims().size()); std::vector ksize = phi::vectorize(filter_data_dims); + std::vector filter_shape = phi::vectorize(filter.dims()); UpdatePaddingAndDilation( &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); @@ -165,12 +180,35 @@ class GemmConvGradXPUKernel : public framework::OpKernel { filter_grad_data = 
reinterpret_cast(filter_grad->data()); } auto &dev_ctx = context.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + + XPUT *filter_data_tmp; + XPUT *filter_grad_data_tmp; + const XPUT *filter_data_ptr = filter_data; + XPUT *filter_grad_data_ptr = filter_grad_data; + if (data_format == "NHWC") { + filter_data_tmp = RAII_GUARD.alloc(filter.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(filter_data_tmp); + int r = xpu::transpose(dev_ctx.x_context(), + filter_data, + filter_data_tmp, + filter_shape, + {0, 2, 3, 1}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + filter_data_ptr = reinterpret_cast(filter_data_tmp); + + if (filter_grad_data != nullptr) { + filter_grad_data_tmp = RAII_GUARD.alloc(filter.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(filter_grad_data_tmp); + filter_grad_data_ptr = filter_grad_data_tmp; + } + } int r = xpu::conv2d_grad(dev_ctx.x_context(), input_data, - filter_data, + filter_data_ptr, output_grad_data, input_grad_data, - filter_grad_data, + filter_grad_data_ptr, batch_size, img_c, img_h, @@ -187,11 +225,18 @@ class GemmConvGradXPUKernel : public framework::OpKernel { nullptr, nullptr, is_nchw); - PADDLE_ENFORCE_EQ( - r, - XPU_SUCCESS, - platform::errors::External( - "XPU conv kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_grad"); + + if ((filter_grad_data_ptr != nullptr) && (data_format == "NHWC")) { + std::vector filter_shape_fhwc = { + filter_shape[0], filter_shape[2], filter_shape[3], filter_shape[1]}; + int r = xpu::transpose(dev_ctx.x_context(), + filter_grad_data_ptr, + filter_grad_data, + filter_shape_fhwc, + {0, 3, 1, 2}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "transpose"); + } } }; } // namespace operators diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py index 387dd88bcd4ea..9dd7247c4a39d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py @@ -498,10 +498,41 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" +class XPUTestConv2DOp_NHWC(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'conv2d' + self.use_dynamic_create_class = False + + class TestConv2DOp_AsyPadding_NHWC( + XPUTestConv2DOp_v2.TestConv2DOp_AsyPadding): + + def init_data_format(self): + self.data_format = "NHWC" + + def init_test_case_2(self): + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + + class TestWithPad_AsyPadding_NHWC(XPUTestConv2DOp_v2.TestWithPad_AsyPadding + ): + + def init_data_format(self): + self.data_format = "NHWC" + + def init_test_case_2(self): + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + + support_types = get_xpu_op_support_types('conv2d') for stype in ['float32']: create_test_class(globals(), XPUTestConv2DOp, stype) create_test_class(globals(), XPUTestConv2DOp_v2, stype) + create_test_class(globals(), + XPUTestConv2DOp_NHWC, + stype, + ignore_deivce_version=[core.XPUVersion.XPU1]) #---------- test SAME VALID ----------- #create_test_padding_SAME_class(TestConv2DOp_AsyPadding) @@ -512,9 +543,5 @@ def init_paddings(self): #create_test_padding_VALID_class(TestWithPad_AsyPadding) #create_test_padding_VALID_class(TestWithStride_AsyPadding) -# ------------ test channel last --------- -#create_test_channel_last_class(TestConv2DOp_AsyPadding) -#create_test_channel_last_class(TestWithPad_AsyPadding) - if __name__ == '__main__': unittest.main() From 
d88e77a7b4acfecb58d8eefcfc994a56f94b301a Mon Sep 17 00:00:00 2001 From: ronnywang Date: Thu, 14 Jul 2022 14:39:37 +0800 Subject: [PATCH 205/250] [CustomDevice] add custom ccl 1/2 (#44294) * [CustomDevice] add custom ccl api * add ut --- .../final_state_generator/python_c_gen.py | 10 + .../allocation/naive_best_fit_allocator.cc | 2 +- paddle/fluid/platform/init.cc | 8 +- paddle/phi/backends/c_comm_lib.cc | 20 ++ paddle/phi/backends/c_comm_lib.h | 60 ++++ paddle/phi/backends/callback_manager.cc | 4 +- paddle/phi/backends/custom/custom_device.cc | 329 +++++++++++++++--- .../phi/backends/custom/custom_device_test.cc | 72 ++++ paddle/phi/backends/custom/fake_cpu_device.h | 88 +++++ paddle/phi/backends/device_base.cc | 87 +++++ paddle/phi/backends/device_base.h | 60 ++++ paddle/phi/backends/device_ext.h | 105 ++++++ paddle/phi/backends/device_manager.cc | 134 +++++++ paddle/phi/backends/device_manager.h | 69 ++++ python/paddle/device/__init__.py | 5 +- 15 files changed, 994 insertions(+), 59 deletions(-) create mode 100644 paddle/phi/backends/c_comm_lib.cc create mode 100644 paddle/phi/backends/c_comm_lib.h diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index c6ac5a12f56d3..9d5706f65bdf0 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -180,6 +180,15 @@ def FindParsingFunctionFromAttributeType(atype): #else PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU if use CUDAPlace.")); +#endif + }} + if (paddle::platform::is_custom_place(place)) {{ +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + phi::DeviceManager::SetDevice(place); + VLOG(1) <<"CurrentDeviceId: " << phi::DeviceManager::GetDevice(place.GetDeviceType()) << " from " << (int)place.device; +#else + PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with CUSTOM_DEVICE if use CustomPlace.")); #endif }} """ @@ -200,6 +209,7 @@ def FindParsingFunctionFromAttributeType(atype): #include #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/api/include/strings_api.h" +#include "paddle/phi/backends/device_manager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 4553c80e74c59..d696b8bffda08 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -764,7 +764,7 @@ class BuddyAllocatorList { private: explicit BuddyAllocatorList(const std::string &device_type) : device_type_(device_type) { - auto devices = phi::DeviceManager::GetDeviceList(device_type); + auto devices = phi::DeviceManager::GetSelectedDeviceList(device_type); for (auto dev_id : devices) { init_flags_[dev_id].reset(new std::once_flag()); } diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index b6f6deb80d67b..6e28c775a38bb 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -264,11 +264,11 @@ void InitDevices(const std::vector devices) { auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (auto &dev_type : device_types) { - auto device_count = 
phi::DeviceManager::GetDeviceCount(dev_type); + auto device_list = phi::DeviceManager::GetSelectedDeviceList(dev_type); LOG(INFO) << "CustomDevice: " << dev_type - << ", visible devices count: " << device_count; - for (size_t i = 0; i < device_count; i++) { - places.push_back(platform::CustomPlace(dev_type, i)); + << ", visible devices count: " << device_list.size(); + for (auto &dev_id : device_list) { + places.push_back(platform::CustomPlace(dev_type, dev_id)); } } } else { diff --git a/paddle/phi/backends/c_comm_lib.cc b/paddle/phi/backends/c_comm_lib.cc new file mode 100644 index 0000000000000..7f86ac6eff91f --- /dev/null +++ b/paddle/phi/backends/c_comm_lib.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/c_comm_lib.h" + +namespace phi { +// Even this source file does not contains any code, it is better to keep this +// source file for cmake dependency. +} // namespace phi diff --git a/paddle/phi/backends/c_comm_lib.h b/paddle/phi/backends/c_comm_lib.h new file mode 100644 index 0000000000000..f2987996cbe58 --- /dev/null +++ b/paddle/phi/backends/c_comm_lib.h @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/macros.h" + +namespace phi { +namespace ccl { +using CCLComm = void*; +using CCLRootId = std::vector; + +enum CCLReduceOp { SUM = 0, AVG, MAX, MIN, PRODUCT }; +enum CCLDataType { + CCL_DATA_TYPE_FP64 = 0, + CCL_DATA_TYPE_FP32, + CCL_DATA_TYPE_FP16, + CCL_DATA_TYPE_INT64, + CCL_DATA_TYPE_INT32, + CCL_DATA_TYPE_INT16, + CCL_DATA_TYPE_INT8 +}; + +inline CCLDataType ToCCLDataType(paddle::experimental::DataType type) { + if (type == paddle::experimental::DataType::FLOAT64) { + return CCL_DATA_TYPE_FP64; + } else if (type == paddle::experimental::DataType::FLOAT32) { + return CCL_DATA_TYPE_FP32; + } else if (type == paddle::experimental::DataType::FLOAT16) { + return CCL_DATA_TYPE_FP16; + } else if (type == paddle::experimental::DataType::INT64) { + return CCL_DATA_TYPE_INT64; + } else if (type == paddle::experimental::DataType::INT32) { + return CCL_DATA_TYPE_INT32; + } else if (type == paddle::experimental::DataType::INT8) { + return CCL_DATA_TYPE_INT8; + } else { + PADDLE_THROW( + phi::errors::Unimplemented("This datatype in CCL is not supported.")); + } +} + +} // namespace ccl +} // namespace phi diff --git a/paddle/phi/backends/callback_manager.cc b/paddle/phi/backends/callback_manager.cc index 295f70fc65cd7..7ce59880383c7 100644 --- a/paddle/phi/backends/callback_manager.cc +++ b/paddle/phi/backends/callback_manager.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/device_guard.h" namespace phi { @@ -33,12 +34,13 @@ void CallbackManager::AddCallback(std::function callback) const { (*callback_func)(); }); }); - + phi::DeviceGuard guard(stream_->GetPlace()); phi::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) ->AddCallback(stream_, func); } void CallbackManager::Wait() const { + phi::DeviceGuard guard(stream_->GetPlace()); phi::DeviceManager::GetDeviceWithPlace(stream_->GetPlace()) ->SynchronizeStream(stream_); diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index 541acd9ecafd0..1a92868dd07db 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -27,6 +27,14 @@ static bool operator==(const C_Device_st& d1, const C_Device_st& d2) { namespace phi { +#define INTERFACE_UNIMPLEMENT \ + PADDLE_THROW(phi::errors::Unimplemented( \ + "%s is not implemented on %s device.", __func__, Type())); +#define CHECK_PTR(x) \ + if (x == nullptr) { \ + INTERFACE_UNIMPLEMENT; \ + } + class CustomDevice : public DeviceInterface { public: CustomDevice(const std::string& type, @@ -561,6 +569,208 @@ class CustomDevice : public DeviceInterface { return version; } + C_DataType ToXCCLDataType(ccl::CCLDataType data_type) { +#define return_result(in, ret) \ + case ccl::CCLDataType::in: \ + return C_DataType::ret + switch (data_type) { + return_result(CCL_DATA_TYPE_FP64, FLOAT64); + return_result(CCL_DATA_TYPE_FP32, FLOAT32); + return_result(CCL_DATA_TYPE_FP16, FLOAT16); + return_result(CCL_DATA_TYPE_INT64, INT64); + return_result(CCL_DATA_TYPE_INT32, INT32); + return_result(CCL_DATA_TYPE_INT16, INT16); + return_result(CCL_DATA_TYPE_INT8, INT8); + default: { + PADDLE_THROW(phi::errors::Unavailable( + "DataType is not supported on %s.", Type())); + return C_DataType::UNDEFINED; + } + } +#undef 
return_result + } + + C_CCLReduceOp ToXCCLReduceOp(ccl::CCLReduceOp reduce_op) { +#define return_result(in, ret) \ + case ccl::CCLReduceOp::in: \ + return C_CCLReduceOp::ret + switch (reduce_op) { + return_result(SUM, SUM); + return_result(AVG, AVG); + return_result(MAX, MAX); + return_result(MIN, MIN); + return_result(PRODUCT, PRODUCT); + default: { + PADDLE_THROW(phi::errors::Unavailable( + "ReduceOp is not supported on %s.", Type())); + } + } +#undef return_result + } + + void CCLGetUniqueId(ccl::CCLRootId* unique_id) override { + CHECK_PTR(pimpl_->xccl_get_unique_id_size); + CHECK_PTR(pimpl_->xccl_get_unique_id); + + C_CCLRootId root_id; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_get_unique_id_size(&(root_id.sz))); + root_id.data = new uint8_t[root_id.sz]; + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_get_unique_id(&root_id)); + + uint8_t* ptr = reinterpret_cast(root_id.data); + *unique_id = std::vector(ptr, ptr + root_id.sz); + delete[] ptr; + } + + void CCLCommInitRank(size_t nranks, + ccl::CCLRootId* unique_id, + size_t rank, + ccl::CCLComm* comm) override { + CHECK_PTR(pimpl_->xccl_comm_init_rank); + + C_CCLRootId root_id; + root_id.sz = unique_id->size(); + root_id.data = unique_id->data(); + + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_comm_init_rank( + nranks, &root_id, rank, reinterpret_cast(comm))); + } + + void CCLDestroyComm(ccl::CCLComm comm) override { + CHECK_PTR(pimpl_->xccl_destroy_comm); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_destroy_comm(reinterpret_cast(comm))); + } + + void CCLAllReduce(void* send_buf, + void* recv_buf, + size_t count, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_all_reduce); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_all_reduce( + send_buf, + recv_buf, + count, + ToXCCLDataType(data_type), + ToXCCLReduceOp(op), + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLBroadcast(void* buf, + size_t count, + ccl::CCLDataType data_type, + size_t root, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_broadcast); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_broadcast( + buf, + count, + ToXCCLDataType(data_type), + root, + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_reduce); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_reduce(in_data, + out_data, + num, + ToXCCLDataType(data_type), + ToXCCLReduceOp(reduce_op), + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLAllGather(void* send_buf, + void* recv_buf, + size_t count, + ccl::CCLDataType data_type, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_all_gather); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_all_gather( + send_buf, + recv_buf, + count, + ToXCCLDataType(data_type), + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLReduceScatter(void* send_buf, + void* recv_buf, + size_t count, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_reduce_scatter); + 
PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_reduce_scatter( + send_buf, + recv_buf, + count, + ToXCCLDataType(data_type), + ToXCCLReduceOp(reduce_op), + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLGroupStart() override { + CHECK_PTR(pimpl_->xccl_group_start); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_group_start()); + } + + void CCLGroupEnd() override { + CHECK_PTR(pimpl_->xccl_group_end); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS(pimpl_->xccl_group_end()); + } + + void CCLSend(void* send_buf, + size_t count, + ccl::CCLDataType data_type, + size_t dest_rank, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_send); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_send(send_buf, + count, + ToXCCLDataType(data_type), + dest_rank, + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + + void CCLRecv(void* recv_buf, + size_t count, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& comm, + const stream::Stream& stream) override { + CHECK_PTR(pimpl_->xccl_recv); + PADDLE_ENFORCE_CUSTOM_DEVICE_SUCCESS( + pimpl_->xccl_recv(recv_buf, + count, + ToXCCLDataType(data_type), + src_rank, + reinterpret_cast(comm), + reinterpret_cast(stream.raw_stream()))); + } + private: inline int PlaceToIdNoCheck(const Place& place) { int dev_id = place.GetDeviceId(); @@ -584,7 +794,7 @@ class CustomDevice : public DeviceInterface { }; bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { -#define CHECK_PTR(ptr, required) \ +#define CHECK_INTERFACE(ptr, required) \ if (params->interface->ptr == nullptr && required) { \ LOG(WARNING) << "CustomRuntime [type: " << params->device_type \ << "] pointer: " << #ptr << " is not set."; \ @@ -604,58 +814,71 @@ bool ValidCustomCustomRuntimeParams(const CustomRuntimeParams* params) { return false; } - CHECK_PTR(initialize, false); - CHECK_PTR(finalize, false) - - CHECK_PTR(init_device, false); - CHECK_PTR(set_device, true); - CHECK_PTR(get_device, true); - CHECK_PTR(deinit_device, false); - - CHECK_PTR(create_stream, true); - CHECK_PTR(destroy_stream, true); - CHECK_PTR(query_stream, false); - CHECK_PTR(stream_add_callback, false); - - CHECK_PTR(create_event, true); - CHECK_PTR(record_event, true); - CHECK_PTR(destroy_event, true); - CHECK_PTR(query_event, false); - - CHECK_PTR(synchronize_device, false); - CHECK_PTR(synchronize_stream, true); - CHECK_PTR(synchronize_event, true); - CHECK_PTR(stream_wait_event, true); - - CHECK_PTR(device_memory_allocate, true); - CHECK_PTR(device_memory_deallocate, true); - CHECK_PTR(host_memory_allocate, false); - CHECK_PTR(host_memory_deallocate, false); - CHECK_PTR(unified_memory_allocate, false); - CHECK_PTR(unified_memory_deallocate, false); - CHECK_PTR(memory_copy_h2d, true); - CHECK_PTR(memory_copy_d2h, true); - CHECK_PTR(memory_copy_d2d, true); - CHECK_PTR(memory_copy_p2p, false); - CHECK_PTR(async_memory_copy_h2d, false); - CHECK_PTR(async_memory_copy_d2h, false); - CHECK_PTR(async_memory_copy_d2d, false); - CHECK_PTR(async_memory_copy_p2p, false); - - CHECK_PTR(get_device_count, true); - CHECK_PTR(get_device_list, true); - CHECK_PTR(device_memory_stats, true); - - CHECK_PTR(device_min_chunk_size, true); - CHECK_PTR(device_max_chunk_size, false); - CHECK_PTR(device_max_alloc_size, false); - CHECK_PTR(device_extra_padding_size, false); - CHECK_PTR(get_compute_capability, false); - CHECK_PTR(get_runtime_version, false); - CHECK_PTR(get_driver_version, false); - + 
CHECK_INTERFACE(initialize, false); + CHECK_INTERFACE(finalize, false) + + CHECK_INTERFACE(init_device, false); + CHECK_INTERFACE(set_device, true); + CHECK_INTERFACE(get_device, true); + CHECK_INTERFACE(deinit_device, false); + + CHECK_INTERFACE(create_stream, true); + CHECK_INTERFACE(destroy_stream, true); + CHECK_INTERFACE(query_stream, false); + CHECK_INTERFACE(stream_add_callback, false); + + CHECK_INTERFACE(create_event, true); + CHECK_INTERFACE(record_event, true); + CHECK_INTERFACE(destroy_event, true); + CHECK_INTERFACE(query_event, false); + + CHECK_INTERFACE(synchronize_device, false); + CHECK_INTERFACE(synchronize_stream, true); + CHECK_INTERFACE(synchronize_event, true); + CHECK_INTERFACE(stream_wait_event, true); + + CHECK_INTERFACE(device_memory_allocate, true); + CHECK_INTERFACE(device_memory_deallocate, true); + CHECK_INTERFACE(host_memory_allocate, false); + CHECK_INTERFACE(host_memory_deallocate, false); + CHECK_INTERFACE(unified_memory_allocate, false); + CHECK_INTERFACE(unified_memory_deallocate, false); + CHECK_INTERFACE(memory_copy_h2d, true); + CHECK_INTERFACE(memory_copy_d2h, true); + CHECK_INTERFACE(memory_copy_d2d, true); + CHECK_INTERFACE(memory_copy_p2p, false); + CHECK_INTERFACE(async_memory_copy_h2d, false); + CHECK_INTERFACE(async_memory_copy_d2h, false); + CHECK_INTERFACE(async_memory_copy_d2d, false); + CHECK_INTERFACE(async_memory_copy_p2p, false); + + CHECK_INTERFACE(get_device_count, true); + CHECK_INTERFACE(get_device_list, true); + CHECK_INTERFACE(device_memory_stats, true); + + CHECK_INTERFACE(device_min_chunk_size, true); + CHECK_INTERFACE(device_max_chunk_size, false); + CHECK_INTERFACE(device_max_alloc_size, false); + CHECK_INTERFACE(device_extra_padding_size, false); + CHECK_INTERFACE(get_compute_capability, false); + CHECK_INTERFACE(get_runtime_version, false); + CHECK_INTERFACE(get_driver_version, false); + + CHECK_INTERFACE(xccl_get_unique_id, false); + CHECK_INTERFACE(xccl_get_unique_id_size, false); + CHECK_INTERFACE(xccl_comm_init_rank, false); + CHECK_INTERFACE(xccl_destroy_comm, false); + CHECK_INTERFACE(xccl_all_reduce, false); + CHECK_INTERFACE(xccl_broadcast, false); + CHECK_INTERFACE(xccl_reduce, false); + CHECK_INTERFACE(xccl_all_gather, false); + CHECK_INTERFACE(xccl_reduce_scatter, false); + CHECK_INTERFACE(xccl_group_start, false); + CHECK_INTERFACE(xccl_group_end, false); + CHECK_INTERFACE(xccl_send, false); + CHECK_INTERFACE(xccl_recv, false); return true; -#undef CHECK_PTR +#undef CHECK_INTERFACE } typedef bool (*RegisterDevicePluginFn)(CustomRuntimeParams* runtime_params); @@ -712,4 +935,6 @@ void LoadCustomRuntimeLib(const std::string& dso_lib_path, void* dso_handle) { LOG(INFO) << "Successed in loading custom runtime in lib: " << dso_lib_path; } +#undef INTERFACE_UNIMPLEMENT + } // namespace phi diff --git a/paddle/phi/backends/custom/custom_device_test.cc b/paddle/phi/backends/custom/custom_device_test.cc index 51fa74b4dc5f3..930750e864883 100644 --- a/paddle/phi/backends/custom/custom_device_test.cc +++ b/paddle/phi/backends/custom/custom_device_test.cc @@ -107,6 +107,7 @@ void TestTensorShareDataWith(const paddle::platform::Place& place) { } void TestTensorUtils(const paddle::platform::Place& place) { + std::cout << "TestTensorUtils on " << place << std::endl; if (paddle::platform::is_custom_place(place) == false) { return; } @@ -166,6 +167,76 @@ void TestTensorUtils(const paddle::platform::Place& place) { #endif } +void TestCustomCCL(const paddle::platform::Place& place) { + std::cout << "TestCustomCCL on " << place << 
std::endl; + if (paddle::platform::is_custom_place(place) == false) { + return; + } + std::string dev_type = place.GetDeviceType(); + phi::ccl::CCLComm comm; + phi::stream::Stream stream(place, nullptr); + phi::ccl::CCLRootId root_id; + + phi::DeviceManager::CCLDestroyComm(dev_type, nullptr); + phi::DeviceManager::CCLGetUniqueId(dev_type, &root_id); + phi::DeviceManager::CCLCommInitRank(dev_type, 0, &root_id, 0, nullptr); + phi::DeviceManager::CCLBroadcast(dev_type, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + 0, + comm, + stream); + phi::DeviceManager::CCLAllReduce(dev_type, + nullptr, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + phi::ccl::CCLReduceOp::SUM, + comm, + stream); + phi::DeviceManager::CCLReduce(dev_type, + nullptr, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + phi::ccl::CCLReduceOp::SUM, + comm, + stream); + phi::DeviceManager::CCLAllGather(dev_type, + nullptr, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + comm, + stream); + phi::DeviceManager::CCLReduceScatter( + dev_type, + nullptr, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + phi::ccl::CCLReduceOp::SUM, + comm, + stream); + phi::DeviceManager::CCLGroupStart(dev_type); + phi::DeviceManager::CCLGroupEnd(dev_type); + phi::DeviceManager::CCLSend(dev_type, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + 0, + comm, + stream); + phi::DeviceManager::CCLRecv(dev_type, + nullptr, + 0, + phi::ccl::CCLDataType::CCL_DATA_TYPE_FP32, + 0, + comm, + stream); +} + TEST(CustomDevice, Tensor) { InitDevice(); auto dev_types = phi::DeviceManager::GetAllDeviceTypes(); @@ -179,6 +250,7 @@ TEST(CustomDevice, Tensor) { TestTensorMutableData(place); TestTensorShareDataWith(place); TestTensorUtils(place); + TestCustomCCL(place); } } diff --git a/paddle/phi/backends/custom/fake_cpu_device.h b/paddle/phi/backends/custom/fake_cpu_device.h index 22c344a0e0488..41c7acc4469cd 100644 --- a/paddle/phi/backends/custom/fake_cpu_device.h +++ b/paddle/phi/backends/custom/fake_cpu_device.h @@ -136,6 +136,80 @@ C_Status DeviceMaxAllocSize(const C_Device device, size_t *size) { return C_SUCCESS; } +C_Status XcclGetUniqueIdSize(size_t *size) { + *size = sizeof(size_t); + return C_SUCCESS; +} +C_Status XcclGetUniqueId(C_CCLRootId *unique_id) { return C_SUCCESS; } +C_Status XcclCommInitRank(size_t ranks, + C_CCLRootId *unique_id, + size_t rank, + C_CCLComm *comm) { + return C_SUCCESS; +} +C_Status XcclDestroyComm(C_CCLComm comm) { return C_SUCCESS; } +C_Status XcclAllReduce(void *send_buf, + void *recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclBroadcast(void *buf, + size_t count, + C_DataType data_type, + size_t root, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclReduce(void *send_buf, + void *recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclAllGather(void *send_buf, + void *recv_buf, + size_t count, + C_DataType data_type, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclReduceScatter(void *send_buf, + void *recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclGroupStart() { return C_SUCCESS; } +C_Status XcclGroupEnd() { return C_SUCCESS; } +C_Status XcclSend(void *send_buf, + size_t count, + C_DataType data_type, + size_t 
dest_rank, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} +C_Status XcclRecv(void *recv_buf, + size_t count, + C_DataType data_type, + size_t src_rank, + C_CCLComm comm, + C_Stream stream) { + return C_SUCCESS; +} + #define DEVICE_TYPE "FakeCPU" #define SUB_DEVICE_TYPE "V100" @@ -190,4 +264,18 @@ void InitFakeCPUDevice(CustomRuntimeParams *params) { params->interface->device_max_chunk_size = DeviceMaxChunkSize; params->interface->device_min_chunk_size = DeviceMinChunkSize; params->interface->device_max_alloc_size = DeviceMaxAllocSize; + + params->interface->xccl_get_unique_id_size = XcclGetUniqueIdSize; + params->interface->xccl_get_unique_id = XcclGetUniqueId; + params->interface->xccl_all_reduce = XcclAllReduce; + params->interface->xccl_all_gather = XcclAllGather; + params->interface->xccl_broadcast = XcclBroadcast; + params->interface->xccl_comm_init_rank = XcclCommInitRank; + params->interface->xccl_destroy_comm = XcclDestroyComm; + params->interface->xccl_group_end = XcclGroupEnd; + params->interface->xccl_group_start = XcclGroupStart; + params->interface->xccl_reduce = XcclReduce; + params->interface->xccl_reduce_scatter = XcclReduceScatter; + params->interface->xccl_send = XcclSend; + params->interface->xccl_recv = XcclRecv; } diff --git a/paddle/phi/backends/device_base.cc b/paddle/phi/backends/device_base.cc index e57653702c538..4b82f4a340ebb 100644 --- a/paddle/phi/backends/device_base.cc +++ b/paddle/phi/backends/device_base.cc @@ -270,4 +270,91 @@ size_t DeviceInterface::GetExtraPaddingSize(size_t dev_id) { return 0; } +void DeviceInterface::CCLDestroyComm(ccl::CCLComm ccl_comm) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLCommInitRank(size_t num_ranks, + ccl::CCLRootId* root_id, + size_t rank_id, + ccl::CCLComm* ccl_comm) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLGetUniqueId(ccl::CCLRootId* root_id) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLBroadcast(void* data, + size_t num, + ccl::CCLDataType data_type, + size_t root, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLAllReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLAllGather(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLReduceScatter(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLGroupStart() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::CCLGroupEnd() { INTERFACE_UNIMPLEMENT; } + +void DeviceInterface::CCLSend(void* sendbuf, + size_t num, + ccl::CCLDataType data_type, + size_t dst_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + INTERFACE_UNIMPLEMENT; +} + +void DeviceInterface::CCLRecv(void* recvbuf, + size_t num, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& 
stream) { + INTERFACE_UNIMPLEMENT; +} + +#undef INTERFACE_UNIMPLEMENT + } // namespace phi diff --git a/paddle/phi/backends/device_base.h b/paddle/phi/backends/device_base.h index 8cc6e498068fa..84249261d1962 100644 --- a/paddle/phi/backends/device_base.h +++ b/paddle/phi/backends/device_base.h @@ -16,6 +16,7 @@ #ifdef PADDLE_WITH_CUSTOM_DEVICE #include +#include "paddle/phi/backends/c_comm_lib.h" #include "paddle/phi/backends/event.h" #include "paddle/phi/backends/stream.h" @@ -165,6 +166,65 @@ class DeviceInterface { // Driver / Runtime virtual size_t GetExtraPaddingSize(size_t dev_id); + // CCL + virtual void CCLDestroyComm(ccl::CCLComm ccl_comm); + + virtual void CCLCommInitRank(size_t num_ranks, + ccl::CCLRootId* root_id, + size_t rank_id, + ccl::CCLComm* ccl_comm); + + virtual void CCLGetUniqueId(ccl::CCLRootId* root_id); + + virtual void CCLBroadcast(void* data, + size_t num, + ccl::CCLDataType data_type, + size_t root, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + + virtual void CCLAllReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLReduce(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLAllGather(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLReduceScatter(void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLGroupStart(); + virtual void CCLGroupEnd(); + virtual void CCLSend(void* sendbuf, + size_t num, + ccl::CCLDataType data_type, + size_t dst_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + virtual void CCLRecv(void* recvbuf, + size_t num, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + private: const std::string type_; const uint8_t priority_; diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index 77c9ee61858c1..a4dc9176e1b1e 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -74,6 +74,15 @@ typedef void (*C_Callback)(C_Device device, void* user_data, C_Status* status); +typedef struct { + size_t sz; + void* data; +} C_CCLRootId; + +typedef struct C_CCLComm_st* C_CCLComm; + +typedef enum { SUM = 0, AVG, MAX, MIN, PRODUCT } C_CCLReduceOp; + struct C_DeviceInterface { // Core fill it and plugin must to check it size_t size; @@ -526,6 +535,102 @@ struct C_DeviceInterface { void* reserved_info_api[8]; + ////////////// + // ccl api // + ////////////// + + /** + * @brief Get size of unique id + * + * @param[size_t*] size + */ + C_Status (*xccl_get_unique_id_size)(size_t* size); + + /** + * @brief Get unique id + * + * @param[C_CCLRootId*] unique_id + */ + C_Status (*xccl_get_unique_id)(C_CCLRootId* unique_id); + + /** + * @brief Initialize communicator + * + * @param[size_t] ranks + * @param[C_CCLRootId*] unique_id + * @param[size_t] rank + * @param[C_CCLComm*] comm + */ + C_Status (*xccl_comm_init_rank)(size_t ranks, + C_CCLRootId* unique_id, + size_t rank, + C_CCLComm* comm); + + /** + * @brief Destroy communicator + * + * @param[C_CCLComm] comm + */ + C_Status 
(*xccl_destroy_comm)(C_CCLComm comm); + + C_Status (*xccl_all_reduce)(void* send_buf, + void* recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_broadcast)(void* buf, + size_t count, + C_DataType data_type, + size_t root, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_reduce)(void* send_buf, + void* recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_all_gather)(void* send_buf, + void* recv_buf, + size_t count, + C_DataType data_type, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_reduce_scatter)(void* send_buf, + void* recv_buf, + size_t count, + C_DataType data_type, + C_CCLReduceOp op, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_group_start)(); + + C_Status (*xccl_group_end)(); + + C_Status (*xccl_send)(void* send_buf, + size_t count, + C_DataType data_type, + size_t dest_rank, + C_CCLComm comm, + C_Stream stream); + + C_Status (*xccl_recv)(void* recv_buf, + size_t count, + C_DataType data_type, + size_t src_rank, + C_CCLComm comm, + C_Stream stream); + + void* reserved_ccl_api[8]; + /////////////// // other api // /////////////// diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index 5b1022794a5c3..405a87f7496a8 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -25,6 +25,7 @@ #include #include "glog/logging.h" +#include "paddle/utils/string/split.h" namespace phi { @@ -390,6 +391,139 @@ std::vector DeviceManager::GetDeviceList( return dev_impl->GetDeviceList(); } +std::vector DeviceManager::GetSelectedDeviceList( + const std::string& device_type) { + std::vector devices; + std::string FLAGS = "FLAGS_selected_" + device_type + "s"; + auto FLAGS_selected_devices = getenv(FLAGS.c_str()); + if (FLAGS_selected_devices) { + auto devices_str = paddle::string::Split(FLAGS_selected_devices, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = DeviceManager::GetDeviceCount(device_type); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void DeviceManager::CCLDestroyComm(const std::string& device_type, + ccl::CCLComm ccl_comm) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLDestroyComm(ccl_comm); +} + +void DeviceManager::CCLCommInitRank(const std::string& device_type, + size_t num_ranks, + ccl::CCLRootId* root_id, + size_t rank_id, + ccl::CCLComm* ccl_comm) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLCommInitRank(num_ranks, root_id, rank_id, ccl_comm); +} + +void DeviceManager::CCLGetUniqueId(const std::string& device_type, + ccl::CCLRootId* root_id) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLGetUniqueId(root_id); +} + +void DeviceManager::CCLBroadcast(const std::string& device_type, + void* data, + size_t num, + ccl::CCLDataType data_type, + size_t root_id, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLBroadcast(data, num, data_type, root_id, ccl_comm, stream); +} + +void DeviceManager::CCLAllReduce(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = 
GetDeviceInterfaceWithType(device_type); + dev_impl->CCLAllReduce( + in_data, out_data, num, data_type, reduce_op, ccl_comm, stream); +} + +void DeviceManager::CCLReduce(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLReduce( + in_data, out_data, num, data_type, reduce_op, ccl_comm, stream); +} + +void DeviceManager::CCLAllGather(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLAllGather(in_data, out_data, num, data_type, ccl_comm, stream); +} + +void DeviceManager::CCLReduceScatter(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLReduceScatter( + in_data, out_data, num, data_type, op, ccl_comm, stream); +} + +void DeviceManager::CCLGroupStart(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLGroupStart(); +} + +void DeviceManager::CCLGroupEnd(const std::string& device_type) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLGroupEnd(); +} + +void DeviceManager::CCLSend(const std::string& device_type, + void* sendbuf, + size_t num, + ccl::CCLDataType data_type, + size_t dst_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLSend(sendbuf, num, data_type, dst_rank, ccl_comm, stream); +} + +void DeviceManager::CCLRecv(const std::string& device_type, + void* recvbuf, + size_t num, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream) { + auto dev_impl = GetDeviceInterfaceWithType(device_type); + dev_impl->CCLRecv(recvbuf, num, data_type, src_rank, ccl_comm, stream); +} + DeviceManager& DeviceManager::Instance() { static DeviceManager platform_manager; return platform_manager; diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 56d99ba43bdd1..4ad7643c33d3c 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -17,6 +17,7 @@ #include +#include "paddle/phi/backends/c_comm_lib.h" #include "paddle/phi/backends/device_base.h" #include "paddle/phi/backends/device_ext.h" #include "paddle/phi/backends/dynload/port.h" @@ -159,6 +160,74 @@ class DeviceManager { static std::vector GetDeviceList(const std::string& device_type); + static std::vector GetSelectedDeviceList( + const std::string& device_type); + + // CCL + static void CCLDestroyComm(const std::string& device_type, + ccl::CCLComm ccl_comm); + static void CCLCommInitRank(const std::string& device_type, + size_t num_ranks, + ccl::CCLRootId* root_id, + size_t rank_id, + ccl::CCLComm* ccl_comm); + static void CCLGetUniqueId(const std::string& device_type, + ccl::CCLRootId* root_id); + static void CCLBroadcast(const std::string& device_type, + void* data, + size_t num, + ccl::CCLDataType data_type, + size_t root, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static 
void CCLAllReduce(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLReduce(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp reduce_op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLAllGather(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLReduceScatter(const std::string& device_type, + void* in_data, + void* out_data, + size_t num, + ccl::CCLDataType data_type, + ccl::CCLReduceOp op, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLGroupStart(const std::string& device_type); + static void CCLGroupEnd(const std::string& device_type); + static void CCLSend(const std::string& device_type, + void* sendbuf, + size_t num, + ccl::CCLDataType data_type, + size_t dst_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void CCLRecv(const std::string& device_type, + void* recvbuf, + size_t num, + ccl::CCLDataType data_type, + size_t src_rank, + const ccl::CCLComm& ccl_comm, + const stream::Stream& stream); + static void Clear(); private: diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 4fcf9c5d21b26..aa959150cec3c 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -230,7 +230,10 @@ def _convert_to_place(device): device_id = int(selected_mlus[0]) place = core.MLUPlace(device_id) elif device in core.get_all_custom_device_type(): - place = core.CustomPlace(device, 0) + selected_devices = os.getenv("FLAGS_selected_{}s".format(device), + "0").split(",") + device_id = int(selected_devices[0]) + place = core.CustomPlace(device, device_id) else: avaliable_gpu_device = re.match(r'gpu:\d+', lower_device) avaliable_xpu_device = re.match(r'xpu:\d+', lower_device) From a2c4c86b1c139ca8242355d673b78e746d189f54 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Thu, 14 Jul 2022 15:45:01 +0800 Subject: [PATCH 206/250] Fix var duplication bug for graph_to_program_pass (#44278) --- paddle/fluid/framework/ir/graph_helper.cc | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 97f486065ac62..b0a2b6754cb2a 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -579,6 +579,12 @@ void GraphToProgram(const Graph &graph, VLOG(3) << "Graph to program need convert " << graph.SubGraphsSize() << " sub graph"; + + std::unordered_set vars_in_root_block; + for (const proto::VarDesc &var : block->vars()) { + vars_in_root_block.insert(var.name()); + } + for (size_t idx = 0; idx < graph.SubGraphsSize(); ++idx) { // avoid kRootBlockIndex not 0 if (idx == kRootBlockIndex) continue; @@ -586,7 +592,14 @@ void GraphToProgram(const Graph &graph, block = program_pb.add_blocks(); block->set_idx(idx); block->set_parent_idx(kRootBlockIndex); - GraphToBlock(*graph.GetSubGraph(idx), block, sort_kind); + + Graph *subgraph = graph.GetSubGraph(idx); + subgraph->SetNotOwned>( + kGraphToProgramVarsToRemove, &vars_in_root_block); + + GraphToBlock(*subgraph, block, sort_kind); + + subgraph->Erase(kGraphToProgramVarsToRemove); 
} } else { GraphToBlock(graph, block, sort_kind); From 1bc47c8468363ffc22190416968bfbc1a5078132 Mon Sep 17 00:00:00 2001 From: Yao Zihang <1162526220@qq.com> Date: Thu, 14 Jul 2022 16:00:52 +0800 Subject: [PATCH 207/250] Optimize batchnorm1d using 2D kernel (#43530) --- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 522 +++++++++++++++++- .../tests/unittests/test_batch_norm_op_v2.py | 59 +- 3 files changed, 549 insertions(+), 38 deletions(-) diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index b23b119342d68..0f028f42a956c 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -591,10 +591,12 @@ void BatchNormGradRawKernel(const Context &ctx, // ctx.GetPlace()), // epsilon, saved_mean_data, saved_var_data)); #else - // CUDNN PER_ACTIVATION mode only support small batch size + // CUDNN only support small batch size const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; const bool use_native_kernel = - (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); if (use_native_kernel) { if (compute_format == DataLayout::kNCHW) { BNBackward diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 702722591553f..61694db7e8ed3 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -31,6 +31,7 @@ namespace cub = hipcub; #include "paddle/phi/kernels/batch_norm_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/gpu/batch_norm_utils.h" #ifdef __HIPCC__ @@ -137,6 +138,398 @@ static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( } } +template +__device__ __forceinline__ void merge_block_vertical( + BatchNormParamType x_sum, + BatchNormParamType x_square_sum, + BatchNormParamType *smem_sum, + BatchNormParamType *smem_square_sum, + BatchNormParamType *x_sum_out, + BatchNormParamType *x_square_sum_out) { + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + if (threadIdx.y < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.y < offset) { + int pair_tid = tid + offset * blockDim.x; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + if (threadIdx.y == 0) { + *x_sum_out = x_sum; + *x_square_sum_out = x_square_sum; + } +} + +template +__device__ __forceinline__ void merge_block_horizonal( + BatchNormParamType x_sum, + BatchNormParamType x_square_sum, + BatchNormParamType *smem_sum, + BatchNormParamType *smem_square_sum, + BatchNormParamType *x_sum_out, + BatchNormParamType *x_square_sum_out) { + int tid = threadIdx.x + threadIdx.y * blockDim.x; +#pragma unroll + for (int offset = blockDim.x / 2; offset > 0; offset >>= 1) { + if (threadIdx.x < offset * 2) { + smem_sum[tid] = x_sum; + smem_square_sum[tid] = x_square_sum; + } + __syncthreads(); + if (threadIdx.x < offset) { + int pair_tid = tid + offset; + x_sum += smem_sum[pair_tid]; + x_square_sum += smem_square_sum[pair_tid]; + } + } + if (threadIdx.x == 0) { + *x_sum_out = 
x_sum; + *x_square_sum_out = x_square_sum; + } +} + +template +static __global__ void BNForwardTraining2DChannelLastCompStat( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *global_mean, + BatchNormParamType *global_variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var, + BatchNormParamType *block_data_ptr, + int *flag_ptr) { + int outer_size = C; + int inner_size = N * HxW; + + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + + int outer_loop_stride = gridDim.x * blockDim.x; + int inner_loop_stride = gridDim.y * blockDim.y; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; + j += inner_loop_stride) { + const int index = j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + + // vertical block sum + merge_block_vertical(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); + + if (gridDim.y > 1) { + volatile BatchNormParamType *staging_sum = block_data_ptr; + volatile BatchNormParamType *staging_square_sum = + &block_data_ptr[C * gridDim.y]; + // write block data to global memory + if (threadIdx.y == 0) { + staging_sum[i + blockIdx.y * C] = x_sum; + staging_square_sum[i + blockIdx.y * C] = x_square_sum; + } + + // make sure write is visible to all blocks + __threadfence(); + __syncthreads(); + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&flag_ptr[blockIdx.x], 1); + is_last_block_done = (old == (gridDim.y - 1)); + } + + __syncthreads(); + + if (is_last_block_done) { + x_sum = static_cast>(0); + x_square_sum = static_cast>(0); + // thread sum + for (int y = threadIdx.y; y < gridDim.y; y += blockDim.y) { + x_sum += staging_sum[i + y * C]; + x_square_sum += staging_square_sum[i + y * C]; + } + + // vertical block sum + merge_block_vertical(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); + + // final compute + if (threadIdx.y == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } else { + if (blockIdx.y == 0 && threadIdx.y == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if 
(save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } +} + +template +static __global__ void BNForwardTraining2DChannelLastWriteRes( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + T *y, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var) { + int outer_size = C; + int inner_size = N * HxW; + + int outer_loop_stride = gridDim.x * blockDim.x; + int inner_loop_stride = gridDim.y * blockDim.y; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType mean_val = compute_mean[i]; + BatchNormParamType inv_var_val = compute_inv_var[i]; + BatchNormParamType scale_val = scale[i]; + BatchNormParamType bias_val = bias[i]; + + for (int j = blockIdx.y * blockDim.y + threadIdx.y; j < inner_size; + j += inner_loop_stride) { + const int index = j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale_val * x_sub_mean * inv_var_val + bias_val; + } + } +} + +template +static __global__ void BNForwardTraining2DCompStat( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + const double epsilon, + double exponentialAverageFactor, + T *y, + BatchNormParamType *global_mean, + BatchNormParamType *global_variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var, + BatchNormParamType *block_data_ptr, + int *flag_ptr) { + int outer_size = C; + int inner_size = N * HxW; + + __shared__ BatchNormParamType smem_sum[BlockDim]; + __shared__ BatchNormParamType smem_square_sum[BlockDim]; + + int outer_loop_stride = gridDim.y * blockDim.y; + int inner_loop_stride = gridDim.x * blockDim.x; + + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; + j += inner_loop_stride) { + const int index = (j / HxW * C + i) * HxW + j % HxW; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + + // horizonal block sum + merge_block_horizonal(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); + + if (gridDim.x > 1) { + volatile BatchNormParamType *staging_sum = block_data_ptr; + volatile BatchNormParamType *staging_square_sum = + &block_data_ptr[C * gridDim.x]; + // write block data to global memory + if (threadIdx.x == 0) { + staging_sum[i + blockIdx.x * C] = x_sum; + staging_square_sum[i + blockIdx.x * C] = x_square_sum; + } + + // make sure write is visible to all blocks + __threadfence(); + __syncthreads(); + + __shared__ bool is_last_block_done; + // mark block done + if (threadIdx.x == 0 && threadIdx.y == 0) { + int old = atomicAdd(&flag_ptr[blockIdx.y], 1); + is_last_block_done = (old == (gridDim.x - 1)); + } + + __syncthreads(); + + if 
(is_last_block_done) { + x_sum = static_cast>(0); + x_square_sum = static_cast>(0); + // thread sum + for (int x = threadIdx.x; x < gridDim.x; x += blockDim.x) { + x_sum += staging_sum[i + x * C]; + x_square_sum += staging_square_sum[i + x * C]; + } + + // horizonal block sum + merge_block_horizonal(x_sum, + x_square_sum, + &smem_sum[0], + &smem_square_sum[0], + &x_sum, + &x_square_sum); + + // final compute + if (threadIdx.x == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } else { + if (blockIdx.x == 0 && threadIdx.x == 0) { + BatchNormParamType compute_mean_val = x_sum / inner_size; + BatchNormParamType variance_val = + x_square_sum / inner_size - compute_mean_val * compute_mean_val; + BatchNormParamType compute_inv_var_val = + 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = compute_mean_val; + save_inv_variance[i] = compute_inv_var_val; + } + global_mean[i] = (1 - exponentialAverageFactor) * compute_mean_val + + exponentialAverageFactor * global_mean[i]; + global_variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * global_variance[i]; + + compute_mean[i] = compute_mean_val; + compute_inv_var[i] = compute_inv_var_val; + } + } + } +} + +template +static __global__ void BNForwardTraining2DWriteRes( + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const int C, + const int N, + const int HxW, + T *y, + BatchNormParamType *compute_mean, + BatchNormParamType *compute_inv_var) { + int outer_size = C; + int inner_size = N * HxW; + + int outer_loop_stride = gridDim.y * blockDim.y; + int inner_loop_stride = gridDim.x * blockDim.x; + + for (int i = blockIdx.y * blockDim.y + threadIdx.y; i < outer_size; + i += outer_loop_stride) { + BatchNormParamType mean_val = compute_mean[i]; + BatchNormParamType inv_var_val = compute_inv_var[i]; + BatchNormParamType scale_val = scale[i]; + BatchNormParamType bias_val = bias[i]; + + for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < inner_size; + j += inner_loop_stride) { + const int index = (j / HxW * C + i) * HxW + j % HxW; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale_val * x_sub_mean * inv_var_val + bias_val; + } + } +} + template void BatchNormKernel(const Context &ctx, const DenseTensor &x, @@ -515,17 +908,63 @@ void BatchNormKernel(const Context &ctx, // static_cast(saved_variance->template mutable_data< // BatchNormParamType>(ctx.GetPlace())))); #else - // CUDNN PER_ACTIVATION mode only support small batch size const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070; + const size_t CUDNN_SPATIAL_THRESHOLD = 880801; const bool use_native_kernel = - (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD); + ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) || + (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD)); if 
(use_native_kernel) { - const int block = 512; - const int max_threads = ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining + dim3 block; + dim3 grid; + const int block_size = 512; + const int MAX_GRID_SIZE = 128; + const int WARP_SIZE = 32; + + // init intermediate storage + DenseTensor block_data_tensor; + DenseTensor flag_tensor; + DenseTensor compute_mean_tensor = + phi::Empty, Context>(ctx, {C}); + DenseTensor compute_inv_var_tensor = + phi::Empty, Context>(ctx, {C}); + + BatchNormParamType *block_data_ptr = nullptr; + int *flag_ptr = nullptr; + + if (x_dims.size() != 2 && compute_format == DataLayout::kNCHW) { + // init block&grid config + int block_x = + std::min(phi::funcs::details::GetLastPow2(H * W * D), block_size); + int block_y = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_x); + + if (block_x * block_y != block_size) { + block_x = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_y); + } + + int grid_x = + std::min((N * H * W * D + block_x * 16 - 1) / (block_x * 16), + MAX_GRID_SIZE); + int grid_y = (C + block_y - 1) / block_y; + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + if (grid.x > 1) { + block_data_tensor = phi::Empty, Context>( + ctx, {2 * C * grid.x}); + flag_tensor = phi::Empty(ctx, {grid.y}); + + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); + } + BNForwardTraining2DCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -539,9 +978,54 @@ void BatchNormKernel(const Context &ctx, mean_out->template data>(), variance_out->template data>(), saved_mean->template data>(), - saved_variance->template data>()); + saved_variance->template data>(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>(), + block_data_ptr, + flag_ptr); + + BNForwardTraining2DWriteRes<<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } else { - BNForwardTraining + // init block&grid config + int block_x = + std::min(phi::funcs::details::GetLastPow2(C), WARP_SIZE); + int block_y = + std::min(phi::funcs::details::GetLastPow2(N * H * W * D / 16), + block_size / block_x); + if (block_x * block_y != block_size) { + block_x = std::min(phi::funcs::details::GetLastPow2(C), + block_size / block_y); + } + int grid_x = (C + block_x - 1) / block_x; + int grid_y = + std::min((N * H * W * D + block_y * 16 - 1) / (block_y * 16), + MAX_GRID_SIZE); + + block.x = block_x; + block.y = block_y; + grid.x = grid_x; + grid.y = grid_y; + + if (grid.y > 1) { + block_data_tensor = phi::Empty, Context>( + ctx, {2 * C * grid.y}); + flag_tensor = phi::Empty(ctx, {grid.x}); + + block_data_ptr = block_data_tensor.data>(); + flag_ptr = flag_tensor.data(); + funcs::SetConstant set_zero; + set_zero(ctx, &flag_tensor, static_cast(0)); + } + BNForwardTraining2DChannelLastCompStat <<>>( transformed_x.template data(), scale.template data>(), @@ -555,7 +1039,23 @@ void BatchNormKernel(const Context &ctx, mean_out->template data>(), variance_out->template data>(), saved_mean->template data>(), - saved_variance->template data>()); + saved_variance->template data>(), + compute_mean_tensor.data>(), + 
compute_inv_var_tensor.data>(), + block_data_ptr, + flag_ptr); + + BNForwardTraining2DChannelLastWriteRes + <<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + transformed_y.template data(), + compute_mean_tensor.data>(), + compute_inv_var_tensor.data>()); } } else { #if CUDNN_VERSION_MIN(7, 4, 1) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index cfd5d5f7c9bd0..7aa3b8cddf80c 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -82,50 +82,58 @@ def error3d(): self.assertRaises(ValueError, error2d_dataformat) self.assertRaises(ValueError, error3d_dataformat) - def test_eager_api(self): - places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(fluid.CUDAPlace(0)) - for p in places: - shape = [4, 10, 4, 4] + def test_large_batch(self): - def compute_v1(x): - with fluid.dygraph.guard(p): - bn = fluid.dygraph.BatchNorm(shape[1]) - #bn = paddle.nn.BatchNorm2D(shape[1]) + def compute_baseline(x): + with fluid.dygraph.guard(p): + bn = fluid.dygraph.BatchNorm(shape[1]) + x1 = paddle.to_tensor(x) + x1.stop_gradient = False + y = bn(x1) + y.backward() + return y.numpy(), x1.gradient() + + def compute_1d(x): + with fluid.dygraph.guard(p): + with _test_eager_guard(): + bn = paddle.nn.BatchNorm1D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) y.backward() return y.numpy(), x1.gradient() - def compute_v2(x): - with fluid.dygraph.guard(p): - with _test_eager_guard(): - print("v2") - bn = paddle.nn.BatchNorm2D(shape[1]) - x1 = paddle.to_tensor(x) - x1.stop_gradient = False - y = bn(x1) - y.backward() - return y.numpy(), x1.gradient() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + # [N, C] + shape = [200000, 4] + x = np.random.randn(*shape).astype("float32") + y1, g1 = compute_baseline(x) + y2, g2 = compute_1d(x) + self.assertTrue(np.allclose(g1, g2)) + self.assertTrue(np.allclose(y1, y2)) + # [N, C, L] + shape = [1000000, 4, 4] x = np.random.randn(*shape).astype("float32") - y1, g1 = compute_v1(x) - y2, g2 = compute_v2(x) + y1, g1 = compute_baseline(x) + y2, g2 = compute_1d(x) self.assertTrue(np.allclose(g1, g2)) self.assertTrue(np.allclose(y1, y2)) - def test_eager_api_1d(self): + def test_eager_api(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: - shape = [200000, 4] + shape = [4, 10, 4, 4] def compute_v1(x): with fluid.dygraph.guard(p): bn = fluid.dygraph.BatchNorm(shape[1]) + #bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) @@ -135,7 +143,8 @@ def compute_v1(x): def compute_v2(x): with fluid.dygraph.guard(p): with _test_eager_guard(): - bn = paddle.nn.BatchNorm1D(shape[1]) + print("v2") + bn = paddle.nn.BatchNorm2D(shape[1]) x1 = paddle.to_tensor(x) x1.stop_gradient = False y = bn(x1) From e9b4d0befebd50796af85ec411e37441b0043921 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 14 Jul 2022 17:47:13 +0800 Subject: [PATCH 208/250] [Phi]Improve the mechanism for mkldnn kernel in PHI (#43941) * adapt mkldnn kernel in PHI * fix ci compile bugs * fix compile bugs * fix compile bugs * fix compile bugs * fix compile bugs * delete comment * fix compile bugs in windows-inference * delete code for converage * modify code by 
review * modify code by review * add todo * fix compile bugs * fix compile bugs * fix compile bugs * fix unittest bugsx --- cmake/phi.cmake | 3 + paddle/fluid/framework/operator.cc | 42 ++- paddle/fluid/framework/phi_utils.cc | 4 +- paddle/fluid/framework/phi_utils_test.cc | 4 +- .../operators/mkldnn/activation_mkldnn_op.cc | 7 - .../operators/mkldnn/batch_norm_mkldnn_op.cc | 7 - paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 7 - .../operators/mkldnn/log_softmax_mkldnn_op.cc | 76 ---- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 7 - .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 7 - paddle/fluid/platform/CMakeLists.txt | 8 +- paddle/fluid/platform/device_context.cc | 269 --------------- paddle/fluid/platform/device_context.h | 129 +------ paddle/phi/api/lib/kernel_dispatch.cc | 2 +- paddle/phi/backends/CMakeLists.txt | 4 + paddle/phi/backends/cpu/cpu_context.cc | 7 +- paddle/phi/backends/onednn/CMakeLists.txt | 6 + paddle/phi/backends/onednn/onednn_context.cc | 326 ++++++++++++++++++ paddle/phi/backends/onednn/onednn_context.h | 143 ++++++++ paddle/phi/common/backend.h | 10 +- paddle/phi/core/compat/convert_utils.cc | 2 +- paddle/phi/core/kernel_factory.cc | 8 + paddle/phi/core/kernel_registry.h | 4 + paddle/phi/core/kernel_utils.h | 5 +- paddle/phi/kernels/CMakeLists.txt | 18 +- paddle/phi/kernels/cpu/log_softmax_kernel.cc | 2 + .../phi/kernels/onednn/log_softmax_kernel.cc | 72 ++++ paddle/phi/tests/common/test_backend.cc | 6 +- 28 files changed, 641 insertions(+), 544 deletions(-) delete mode 100644 paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc create mode 100644 paddle/phi/backends/onednn/CMakeLists.txt create mode 100644 paddle/phi/backends/onednn/onednn_context.cc create mode 100644 paddle/phi/backends/onednn/onednn_context.h create mode 100644 paddle/phi/kernels/onednn/log_softmax_kernel.cc diff --git a/cmake/phi.cmake b/cmake/phi.cmake index 9f716969dcdec..e320473d9be2f 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -103,6 +103,9 @@ function(kernel_declare TARGET_LIST) elseif(${kernel_path} MATCHES "./kps\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") + elseif(${kernel_path} MATCHES "./onednn\/") + file(APPEND ${kernel_declare_file} + "PD_DECLARE_KERNEL(${kernel_name}, OneDNN, ALL_LAYOUT);\n") else() # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 83521be98fc17..4f50996267b97 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1276,24 +1276,32 @@ bool OperatorWithKernel::SupportNPU() const { bool OperatorWithKernel::SupportsMKLDNN( const proto::VarType::Type data_type) const { - auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); - if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) { - VLOG(6) << "Warning: " << type_ - << " don't find its MKLDNN Kernel in Fluid " - "Registered Kernels. 
And We don't " - "search its kernels in phi lib, " - "SupportsMKLDNN() return false."; - return false; + auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( + phi::TransToPhiKernelName(type_)); + auto has_phi_kernel = + std::any_of(phi_kernels.begin(), + phi_kernels.end(), + [](phi::KernelKeyMap::const_reference kern_pair) { + return kern_pair.first.backend() == phi::Backend::ONEDNN; + }); + if (has_phi_kernel) { + return true; + } else { + auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); + if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) { + return false; + } else { + auto& op_kernels = op_kernel_iter->second; + return std::any_of( + op_kernels.begin(), + op_kernels.end(), + [data_type](OpKernelMap::const_reference kern_pair) { + return platform::is_cpu_place(kern_pair.first.place_) && + kern_pair.first.library_type_ == LibraryType::kMKLDNN && + kern_pair.first.data_type_ == data_type; + }); + } } - auto& op_kernels = op_kernel_iter->second; - return std::any_of(op_kernels.begin(), - op_kernels.end(), - [data_type](OpKernelMap::const_reference kern_pair) { - return platform::is_cpu_place(kern_pair.first.place_) && - kern_pair.first.library_type_ == - LibraryType::kMKLDNN && - kern_pair.first.data_type_ == data_type; - }); } bool OperatorWithKernel::SupportsKernelType( diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index fada192e55e14..2e56fea28e0b5 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -66,7 +66,7 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { platform::Place place = phi::TransToPhiPlace(kernel_key.backend(), false); DataLayout data_layout = kernel_key.layout(); LibraryType library_type = LibraryType::kPlain; - if (kernel_key.backend() == phi::Backend::MKLDNN) { + if (kernel_key.backend() == phi::Backend::ONEDNN) { library_type = LibraryType::kMKLDNN; } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; @@ -87,7 +87,7 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey( backend = phi::Backend::GPUDNN; break; case LibraryType::kMKLDNN: - backend = phi::Backend::MKLDNN; + backend = phi::Backend::ONEDNN; break; case LibraryType::kKP: backend = phi::Backend::KPS; diff --git a/paddle/fluid/framework/phi_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc index e8f8825006094..94ab77f310f99 100644 --- a/paddle/fluid/framework/phi_utils_test.cc +++ b/paddle/fluid/framework/phi_utils_test.cc @@ -32,7 +32,7 @@ TEST(PhiUtils, TransPhiKernelKeyToOpKernelType) { #ifdef PADDLE_WITH_MKLDNN phi::KernelKey kernel_key_mkldnn( - phi::Backend::MKLDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); + phi::Backend::ONEDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); op_kernel_type = paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key_mkldnn); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); @@ -76,7 +76,7 @@ TEST(PhiUtils, TransOpKernelTypeToPhiKernelKey) { paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_mkldnn); ASSERT_EQ(kernel_key_mkldnn.dtype(), phi::DataType::FLOAT32); ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::MKLDNN); - ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::MKLDNN); + ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::ONEDNN); #endif #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 
c9b4514995290..eb0d03ce00a97 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -20,13 +20,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index ee83ffffd9786..f41068dd5f1ae 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -19,13 +19,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 8cfbc95be7a1a..7404972ea7cca 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -21,13 +21,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc deleted file mode 100644 index b8ca40a0309e6..0000000000000 --- a/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; - -template -class LogSoftmaxMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - LogSoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, - platform::Place cpu_place, - const Tensor* x, - const int axis) - : platform::MKLDNNHandlerNoCachingT( - mkldnn_engine, cpu_place) { - this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_inference, x->mem_desc(), axis); - } -}; - -template -class LogSoftmaxMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - const Tensor* x = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); - - int axis = ctx.Attr("axis"); - axis = axis >= 0 ? 
axis : x->dims().size() + axis; - - LogSoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), x, axis); - - auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); - - auto logsoftmax_p = handler.AcquireForwardPrimitive(); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - logsoftmax_p->execute( - astream, - {{DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}); - astream.wait(); - - out->set_mem_desc(dst_memory_p->get_desc()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL(log_softmax, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::LogSoftmaxMKLDNNKernel, - ops::LogSoftmaxMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 813ebb2c81ce9..ec341c30773e8 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -21,13 +21,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 2202349bd66c1..f71785e72cd4d 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -31,13 +31,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index b00e4056259d9..2ff31aa5b54fc 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -195,6 +195,7 @@ cc_library( # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies + cc_library( device_context SRCS device_context.cc @@ -219,12 +220,17 @@ cc_library( ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 - cpu_context generator) if(WITH_XPU) target_link_libraries(device_context xpu_context xpu_resource_pool) endif() +if(WITH_MKLDNN) + target_link_libraries(device_context onednn_context) +endif() + +target_link_libraries(device_context cpu_context) + cc_library( collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6bceb696c0f8e..1e978f078dc84 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -753,275 +753,6 @@ Eigen::DefaultDevice* CUDAPinnedDeviceContext::eigen_device() const { const Place& CUDAPinnedDeviceContext::GetPlace() const { return place_; } #endif -#ifdef PADDLE_WITH_MKLDNN -MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) - : phi::CPUContext(place), p_blobmap_() { - p_blobmap_.reset(new BlobMap()); - p_exec_items_.reset(new ExecShape()); - p_mutex_.reset(new std::mutex()); -} - -MKLDNNDeviceContextThreadLocals::Body::Body() - : cur_engine(dnnl::engine::kind::cpu, 0), cur_stream(cur_engine) { - cur_mkldnn_session_id = kMKLDNNSessionID_Default; - cur_input_shape_str = ""; - cur_input_shape_cache_capacity = 1; - cur_paddle_data_layout = 
paddle::framework::DataLayout::kNCHW; -} - -// When Thread finish we clear oneDNN cache -// This is needed when we have one executor used by many threads -// e.g. test_analyzer_detect. Thread ID is not part of caching key -// (for naive executor) so we need to clear cache when one thread finish -// and other is to start inference -// TODO(jczaja): Ideally it would be good to clear only part of cache -// related to thread that is to be terminated -MKLDNNDeviceContextThreadLocals::Body::~Body() { - auto cpu_place = paddle::platform::CPUPlace(); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::MKLDNNDeviceContext* dev_ctx = - (platform::MKLDNNDeviceContext*)pool.Get(cpu_place); - dev_ctx->ResetBlobMap(exec_ptr_); -} - -void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id( - size_t sid) { - cur_mkldnn_session_id = sid; -} -size_t MKLDNNDeviceContextThreadLocals::Body::get_cur_mkldnn_session_id(void) { - return cur_mkldnn_session_id; -} - -void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_str( - std::string input_shape_str) { - cur_input_shape_str = input_shape_str; -} -void MKLDNNDeviceContextThreadLocals::Body::set_cur_input_shape_cache_capacity( - int input_shape_cache_capacity) { - cur_input_shape_cache_capacity = input_shape_cache_capacity; -} - -void MKLDNNDeviceContextThreadLocals::Body::set_cur_paddle_data_layout( - framework::DataLayout dl) { - cur_paddle_data_layout = dl; -} - -framework::DataLayout -MKLDNNDeviceContextThreadLocals::Body::get_cur_paddle_data_layout(void) { - return cur_paddle_data_layout; -} - -void MKLDNNDeviceContextThreadLocals::Body::log_lib_version(void) { - if (!said_once) { - said_once = true; - auto dv = dnnl::version(); - LOG(INFO) << "oneDNN v" << dv->major << "." << dv->minor << "." - << dv->patch; - } -} - -const dnnl::engine& MKLDNNDeviceContextThreadLocals::Body::get_engine(void) { - return cur_engine; -} - -dnnl::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { - return cur_stream; -} - -void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { - VLOG(4) << tls().get_curr_exec() << " " << ptr; - std::lock_guard lock(*p_mutex_); - if (block_next_cache_clearing_ == 0) { - VLOG(3) << "Clearing DNNL cache."; - // If no specific executor pointer then clear - // everything. For executor pointer then clear only - // objects allocated when using given executor - if (ptr == nullptr) { - p_blobmap_->clear(); - } else { - // Iterate through all shapes and release - // for each shape and active executor all entries - // of this executor - for (auto& s : *p_exec_items_) { - for (auto& v : (*s.second)[ptr]) { - (v.first)->erase(v.second); - } - s.second->erase(ptr); - } - } - // Reset paddle layout to NCHW - VLOG(3) << "Resetting Paddle data layout to NCHW."; - platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( - paddle::framework::DataLayout::kNCHW); - } else { - --block_next_cache_clearing_; - VLOG(3) << "Prevented Clearing DNNL cache. Updated " - "block_next_cache_clearing_ : " - << block_next_cache_clearing_; - PADDLE_ENFORCE_GE(block_next_cache_clearing_, - 0, - platform::errors::InvalidArgument( - "Cache clearing mark should be non-negative " - ". 
But received %d.", - block_next_cache_clearing_)); - } -} - -void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const { - p_exec_items_->erase(p_exec_items_->begin()); -} - -void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, - KeyBlob::iterator it) const { - // Take current input shape from TLS - // Take current executor addess from TLS - // and for this executor's items add the one defined with arguments - auto key_it = p_exec_items_ - ->insert(std::make_pair(tls().cur_input_shape_str, - std::make_shared())) - .first; - (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); - - VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size() - << " curr exec size: " - << (*key_it->second)[tls().get_curr_exec()].size() << "\n"; -} - -void MKLDNNDeviceContext::BlockNextCacheClearing() { - std::lock_guard lock(*p_mutex_); - ++block_next_cache_clearing_; - VLOG(3) << "Next DNNL cache clearing has been blocked. Updated " - "block_next_cache_clearing_ : " - << block_next_cache_clearing_; -} - -size_t MKLDNNDeviceContext::GetShapeBlobSize() const { - std::lock_guard lock(*p_mutex_); - BlobMap* pMap = p_blobmap_.get(); - auto map_it = pMap->find(tls().cur_mkldnn_session_id); - if (map_it == pMap->end()) { - PADDLE_THROW(platform::errors::NotFound( - "MKLDNNDeviceContext don't find cur_mkldnn_session_id: %d.", - tls().cur_mkldnn_session_id)); - } - return map_it->second->size(); -} - -void MKLDNNDeviceContext::SetBlob(const std::string& name, - BlobPtr_t data) const { - BlobMap* pMap = p_blobmap_.get(); - BlobPtr_t sBlob = nullptr; - BlobPtr_t pBlob = nullptr; - - int sid = tls().get_cur_mkldnn_session_id(); - - std::lock_guard lock(*p_mutex_); - - // Find ShapeBlob for current mkldnn session id. - auto map_it = pMap->find(sid); - - if (map_it == pMap->end()) { - // 1st time to set blob in current thread - sBlob = std::make_shared(); - (*pMap)[sid] = sBlob; - VLOG(2) << "SetBlob: sid=" << sid << ", add new sid\n"; - } else { - sBlob = map_it->second; - } - - // Find KeyBlob for current input shape - auto key_it = sBlob->find(tls().cur_input_shape_str); - - if (key_it == sBlob->end()) { - // In cache clearing mode, cur_input_shape_cache_capacity defines - // max pblob capacity - if ((static_cast(sid) == - MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_CacheClearing) && - sBlob->size() && - (sBlob->size() >= - static_cast(tls().cur_input_shape_cache_capacity))) { - VLOG(2) << "sid=" << sid - << ", remove all blobs of shape: " << sBlob->begin()->first; - sBlob->erase(sBlob->begin()->first); - RemoveShapeEntriesWithExecutor(); - } - pBlob = std::make_shared(); - (*sBlob)[tls().cur_input_shape_str] = pBlob; - } else { - pBlob = key_it->second; - } - - // Find Blob via name - auto blob_it = pBlob->find(name); - if (blob_it == pBlob->end()) { - auto el = - pBlob->insert(std::make_pair(name, data)); // (*pBlob)[name] = data; - // Register new element in per executor map - // to have easily erased when executor terminated - LinkEntryWithExecutor(pBlob, el.first); - } else { - blob_it->second = data; // set data to existing blob - } - VLOG(2) << "SetBlob: sid=" << sid << ", add blob=" << name << "\n"; - // lock will be automatically released when out of scope - return; -} - -unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const { - unsigned int num_entries = 0; - for (auto const& l3 : *p_blobmap_) { - for (auto const& l2 : *(l3.second)) { - num_entries += (l2.second)->size(); - } - } - return num_entries; -} - 
-MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( - const std::string& name) const { - BlobMap* pMap = p_blobmap_.get(); - BlobPtr_t sBlob = nullptr; - BlobPtr_t pBlob = nullptr; - - int sid = tls().get_cur_mkldnn_session_id(); - - std::lock_guard lock(*p_mutex_); - - // Find ShapeBlob for current mkldnn session id firstly - auto map_it = pMap->find(sid); - // (jczaja): After first iteration of model's execution we - // should have all elements cached (mostly) so failures are unlikely (less - // likely for dynamic shapes) - if (unlikely(map_it == pMap->end())) { - VLOG(2) << "GetBlob: sid=" << sid << ", miss sid\n"; - return nullptr; - } - sBlob = map_it->second; - - // Find KeyBlob for current input shape secondly - auto sBlob_it = sBlob->find(tls().cur_input_shape_str); - if (unlikely(sBlob_it == sBlob->end())) { - VLOG(2) << "GetBlob: sid=" << tls().cur_input_shape_str - << ", miss input_shape_str\n"; - return nullptr; - } - pBlob = sBlob_it->second; - - // Find Blob via name - auto key_it = pBlob->find(name); - - if (unlikely(key_it == pBlob->end())) { - VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n"; - return nullptr; - } - - VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n"; - // lock will be automatically released when out of scope - return key_it->second; -} - -#endif - #ifdef PADDLE_WITH_CUSTOM_DEVICE CustomDeviceContext::CustomDeviceContext(CustomPlace place) : phi::CustomContext(place) { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 4459c913f005d..d0443e30cf9c6 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -59,6 +59,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_MKLDNN #include "dnnl.hpp" // NOLINT #include "paddle/fluid/framework/data_layout.h" +#include "paddle/phi/backends/onednn/onednn_context.h" #endif #include @@ -716,132 +717,8 @@ struct DefaultDeviceContextType { #endif #ifdef PADDLE_WITH_MKLDNN - -class MKLDNNDeviceContextThreadLocals { - // default mkldnn session id - - typedef MKLDNNDeviceContextThreadLocals self; - struct Body { - bool said_once = false; - size_t cur_mkldnn_session_id; - // Current data input shape string. - // - For fixed-shape, it's a null string in default. - // - For dynamic-shape, it's user specific. - std::string cur_input_shape_str; - // the cache capacity of different input shapes for MKLDNN. - // Default 1 means fixed input shape, not dynamic shape. - int cur_input_shape_cache_capacity; - // Recently registered data_format. 
This is needed to - // know for converting MKL-DNN Tensor to non MKL-DNN - paddle::framework::DataLayout cur_paddle_data_layout; - // MKL-DNN stream used for execution of primitives (per-thread) - dnnl::engine cur_engine; - dnnl::stream cur_stream; - std::string key_suffix; // Key identifying current Executor - bool key_attach_thread_id = true; - void* exec_ptr_ = nullptr; - - Body(); - ~Body(); - void set_cur_mkldnn_session_id(size_t sid); - size_t get_cur_mkldnn_session_id(void); - void set_cur_input_shape_str(std::string input_shape_str); - void set_cur_input_shape_cache_capacity(int input_shape_cache_capacity); - void set_cur_paddle_data_layout(framework::DataLayout dl); - framework::DataLayout get_cur_paddle_data_layout(void); - void log_lib_version(void); - const dnnl::engine& get_engine(void); - dnnl::stream& get_stream(void); - void set_key_suffix(const std::string& suffix) { key_suffix = suffix; } - const std::string& get_key_suffix(void) const { return key_suffix; } - void disable_tid_in_key(void) { key_attach_thread_id = false; } - bool is_tid_used_in_key(void) const { return key_attach_thread_id; } - void set_curr_exec(void* exec_ptr) { exec_ptr_ = exec_ptr; } - void* get_curr_exec(void) const { return exec_ptr_; } - }; - MKLDNNDeviceContextThreadLocals() = default; - MKLDNNDeviceContextThreadLocals(const MKLDNNDeviceContextThreadLocals& c) = - delete; - - public: - // default mkldnn session id - static constexpr size_t kMKLDNNSessionID_Default = 0; - // mkldnn session id for cache clearing mode - static constexpr size_t kMKLDNNSessionID_CacheClearing = -1; - static Body& fetch() { - thread_local Body b; - return b; - } -}; - -class MKLDNNDeviceContext : public phi::CPUContext { - public: - template - using BlobPtr_t = std::shared_ptr; - template - using umap_value_smart_t = std::unordered_map>; - template - using umap_key_string_t = umap_value_smart_t; - - // Following three maps are used to cache MKLDNN primitives. - // There relations are: - // - BlobMap = Map - // - ShapeBlob = Map - // - KeyBlob = Map - - using KeyBlob = umap_key_string_t; - using ShapeBlob = umap_key_string_t; - using BlobMap = umap_value_smart_t; - - // Auxillary two-level structure (shape, executor) to easier control - // clearing cache objects related to specific executor - - using ExecKey = void*; - using ExecMapCacheIterPair = std::pair, KeyBlob::iterator>; - using ExecMap = - std::unordered_map>; - using ExecShape = std::unordered_map>; - - explicit MKLDNNDeviceContext(CPUPlace place); - - /* \brief Get the active engine */ - const dnnl::engine& GetEngine() const { return tls().get_engine(); } - - // Register object to currently used executor's map - void LinkEntryWithExecutor(BlobPtr_t, KeyBlob::iterator) const; - void RemoveShapeEntriesWithExecutor(void) const; - - // Remove all entries from the blob map - void ResetBlobMap(void* ptr); - - // Prevent next ResetBlobMap() - void BlockNextCacheClearing(); - - // Get the ShapeBlob size in cur_mkldnn_session_id. - size_t GetShapeBlobSize() const; - - // Set data to blob (i.e. name/data pair). Create blob if not existing - void SetBlob(const std::string& name, std::shared_ptr data) const; - - // Calculate number of oneDNN objects cached - unsigned int GetCachedObjectsNumber(void) const; - - // Find a saved blob. 
Return nullptr if not found - std::shared_ptr GetBlob(const std::string& name) const; - - static auto tls() -> decltype(MKLDNNDeviceContextThreadLocals::fetch()) { - return MKLDNNDeviceContextThreadLocals::fetch(); - } - - private: - std::shared_ptr p_blobmap_; - // Map key is pointer of executor and value is a data(iterator in map) needed - // to erase - std::shared_ptr p_exec_items_; - std::shared_ptr p_mutex_; - // 0 - clearing is allowed. x > 0 do not clear. - unsigned int block_next_cache_clearing_ = 0; -}; +using MKLDNNDeviceContextThreadLocals = phi::OneDNNContextThreadLocals; +using MKLDNNDeviceContext = phi::OneDNNContext; #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index a534f02663dff..27e88d217971b 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -57,7 +57,7 @@ BackendSet GetTensorBackendSet(const phi::TensorBase& t) { BackendSet backend_set(phi::TransToPhiBackend(t.place())); switch (t.layout()) { case DataLayout::MKLDNN: - backend_set = backend_set | BackendSet(Backend::MKLDNN); + backend_set = backend_set | BackendSet(Backend::ONEDNN); break; default: // do nothing diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index c981b625192da..50367a32b02b8 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -12,6 +12,10 @@ if(WITH_XPU) add_subdirectory(xpu) endif() +if(WITH_MKLDNN) + add_subdirectory(onednn) +endif() + cc_library( phi_context SRCS all_context.cc diff --git a/paddle/phi/backends/cpu/cpu_context.cc b/paddle/phi/backends/cpu/cpu_context.cc index 63b5d82f3bdd0..d42189e00eeb8 100644 --- a/paddle/phi/backends/cpu/cpu_context.cc +++ b/paddle/phi/backends/cpu/cpu_context.cc @@ -14,8 +14,8 @@ #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. @@ -41,7 +41,10 @@ struct CPUContext::Impl { } Eigen::DefaultDevice* GetEigenDevice() const { - PD_CHECK(eigen_device_ != nullptr, "the cpu eigen_device is nullptr."); + PADDLE_ENFORCE_NE( + eigen_device_, + nullptr, + phi::errors::Unavailable("the cpu eigen_device is nullptr.")); return eigen_device_; } diff --git a/paddle/phi/backends/onednn/CMakeLists.txt b/paddle/phi/backends/onednn/CMakeLists.txt new file mode 100644 index 0000000000000..a65d6b002f5f7 --- /dev/null +++ b/paddle/phi/backends/onednn/CMakeLists.txt @@ -0,0 +1,6 @@ +if(WITH_MKLDNN) + cc_library( + onednn_context + SRCS onednn_context.cc + DEPS cpu_context mkldnn) +endif() diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc new file mode 100644 index 0000000000000..950483a469ed8 --- /dev/null +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -0,0 +1,326 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/phi/backends/onednn/onednn_context.h" + +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/enforce.h" + +#include "paddle/fluid/framework/expect.h" +#include "paddle/fluid/platform/device_context.h" + +namespace phi { + +OneDNNContextThreadLocals::Body::Body() + : cur_engine(dnnl::engine::kind::cpu, 0), cur_stream(cur_engine) { + cur_mkldnn_session_id = kMKLDNNSessionID_Default; + cur_input_shape_str = ""; + cur_input_shape_cache_capacity = 1; + cur_paddle_data_layout = DataLayout::kNCHW; +} + +// When Thread finish we clear oneDNN cache +// This is needed when we have one executor used by many threads +// e.g. test_analyzer_detect. Thread ID is not part of caching key +// (for naive executor) so we need to clear cache when one thread finish +// and other is to start inference +// TODO(jczaja): Ideally it would be good to clear only part of cache +// related to thread that is to be terminated +OneDNNContextThreadLocals::Body::~Body() { + auto cpu_place = phi::CPUPlace(); + // TODO(YuanRisheng): we need remove the dependency on fluid device context + // here + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + OneDNNContext* dev_ctx = static_cast(pool.Get(cpu_place)); + dev_ctx->ResetBlobMap(exec_ptr_); +} + +void OneDNNContextThreadLocals::Body::set_cur_mkldnn_session_id(size_t sid) { + cur_mkldnn_session_id = sid; +} +size_t OneDNNContextThreadLocals::Body::get_cur_mkldnn_session_id(void) { + return cur_mkldnn_session_id; +} + +void OneDNNContextThreadLocals::Body::set_cur_input_shape_str( + std::string input_shape_str) { + cur_input_shape_str = input_shape_str; +} +void OneDNNContextThreadLocals::Body::set_cur_input_shape_cache_capacity( + int input_shape_cache_capacity) { + cur_input_shape_cache_capacity = input_shape_cache_capacity; +} + +void OneDNNContextThreadLocals::Body::set_cur_paddle_data_layout( + DataLayout dl) { + cur_paddle_data_layout = dl; +} + +DataLayout OneDNNContextThreadLocals::Body::get_cur_paddle_data_layout(void) { + return cur_paddle_data_layout; +} + +void OneDNNContextThreadLocals::Body::log_lib_version(void) { + if (!said_once) { + said_once = true; + auto dv = dnnl::version(); + LOG(INFO) << "oneDNN v" << dv->major << "." << dv->minor << "." + << dv->patch; + } +} + +struct OneDNNContext::Impl { + Impl() : p_blobmap_() { + p_blobmap_.reset(new BlobMap()); + p_exec_items_.reset(new ExecShape()); + p_mutex_.reset(new std::mutex()); + } + + ~Impl() {} + + void ResetBlobMap(void* ptr) { + VLOG(4) << OneDNNContext::tls().get_curr_exec() << " " << ptr; + std::lock_guard lock(*p_mutex_); + if (block_next_cache_clearing_ == 0) { + VLOG(3) << "Clearing DNNL cache."; + // If no specific executor pointer then clear + // everything. 
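+      // ("everything" means the whole three-level p_blobmap_; the per-executor
+      // entries tracked in p_exec_items_ are what make the narrower,
+      // executor-scoped branch below possible.)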
For executor pointer then clear only + // objects allocated when using given executor + if (ptr == nullptr) { + p_blobmap_->clear(); + } else { + // Iterate through all shapes and release + // for each shape and active executor all entries + // of this executor + for (auto& s : *p_exec_items_) { + for (auto& v : (*s.second)[ptr]) { + (v.first)->erase(v.second); + } + s.second->erase(ptr); + } + } + // Reset paddle layout to NCHW + VLOG(3) << "Resetting Paddle data layout to NCHW."; + OneDNNContext::tls().set_cur_paddle_data_layout(DataLayout::kNCHW); + } else { + --block_next_cache_clearing_; + VLOG(3) << "Prevented Clearing DNNL cache. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; + PADDLE_ENFORCE_GE(block_next_cache_clearing_, + 0, + phi::errors::InvalidArgument( + "Cache clearing mark should be non-negative " + ". But received %d.", + block_next_cache_clearing_)); + } + } + + // Register object to currently used executor's map + void LinkEntryWithExecutor(BlobPtr_t pblob, + KeyBlob::iterator it) const { + // Take current input shape from TLS + // Take current executor addess from TLS + // and for this executor's items add the one defined with arguments + auto key_it = + p_exec_items_ + ->insert(std::make_pair(OneDNNContext::tls().cur_input_shape_str, + std::make_shared())) + .first; + (*key_it->second)[OneDNNContext::tls().get_curr_exec()].push_back( + std::make_pair(pblob, it)); + + VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size() + << " curr exec size: " + << (*key_it->second)[OneDNNContext::tls().get_curr_exec()].size() + << "\n"; + } + + void RemoveShapeEntriesWithExecutor() const { + p_exec_items_->erase(p_exec_items_->begin()); + } + + void BlockNextCacheClearing() { + std::lock_guard lock(*p_mutex_); + ++block_next_cache_clearing_; + VLOG(3) << "Next DNNL cache clearing has been blocked. Updated " + "block_next_cache_clearing_ : " + << block_next_cache_clearing_; + } + + size_t GetShapeBlobSize() const { + std::lock_guard lock(*p_mutex_); + BlobMap* pMap = p_blobmap_.get(); + auto map_it = pMap->find(OneDNNContext::tls().cur_mkldnn_session_id); + if (map_it == pMap->end()) { + PADDLE_THROW(phi::errors::NotFound( + "OneDNNContext don't find cur_mkldnn_session_id: %d.", + OneDNNContext::tls().cur_mkldnn_session_id)); + } + return map_it->second->size(); + } + + void SetBlob(const std::string& name, BlobPtr_t data) const { + BlobMap* pMap = p_blobmap_.get(); + BlobPtr_t sBlob = nullptr; + BlobPtr_t pBlob = nullptr; + + int sid = OneDNNContext::tls().get_cur_mkldnn_session_id(); + + std::lock_guard lock(*p_mutex_); + + // Find ShapeBlob for current mkldnn session id. 
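+    // The lookup below follows the three-level cache layout declared in
+    // onednn_context.h:
+    //   BlobMap   : cur_mkldnn_session_id -> ShapeBlob
+    //   ShapeBlob : cur_input_shape_str   -> KeyBlob
+    //   KeyBlob   : blob name             -> cached oneDNN object
+    // Missing levels are created on the way down. A caller such as a oneDNN
+    // operator handler would use the cache roughly as sketched here; the key
+    // string is illustrative only, not an API defined in this file:
+    //   dev_ctx.SetBlob("conv2d_fwd_pd@2x3x224x224", pd_ptr);
+    //   auto cached = dev_ctx.GetBlob("conv2d_fwd_pd@2x3x224x224");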
+ auto map_it = pMap->find(sid); + + if (map_it == pMap->end()) { + // 1st time to set blob in current thread + sBlob = std::make_shared(); + (*pMap)[sid] = sBlob; + VLOG(2) << "SetBlob: sid=" << sid << ", add new sid\n"; + } else { + sBlob = map_it->second; + } + + // Find KeyBlob for current input shape + auto key_it = sBlob->find(OneDNNContext::tls().cur_input_shape_str); + + if (key_it == sBlob->end()) { + // In cache clearing mode, cur_input_shape_cache_capacity defines + // max pblob capacity + if ((static_cast(sid) == + OneDNNContextThreadLocals::kMKLDNNSessionID_CacheClearing) && + sBlob->size() && + (sBlob->size() >= + static_cast( + OneDNNContext::tls().cur_input_shape_cache_capacity))) { + VLOG(2) << "sid=" << sid + << ", remove all blobs of shape: " << sBlob->begin()->first; + sBlob->erase(sBlob->begin()->first); + RemoveShapeEntriesWithExecutor(); + } + pBlob = std::make_shared(); + (*sBlob)[OneDNNContext::tls().cur_input_shape_str] = pBlob; + } else { + pBlob = key_it->second; + } + + // Find Blob via name + auto blob_it = pBlob->find(name); + if (blob_it == pBlob->end()) { + auto el = + pBlob->insert(std::make_pair(name, data)); // (*pBlob)[name] = data; + // Register new element in per executor map + // to have easily erased when executor terminated + LinkEntryWithExecutor(pBlob, el.first); + } else { + blob_it->second = data; // set data to existing blob + } + VLOG(2) << "SetBlob: sid=" << sid << ", add blob=" << name << "\n"; + // lock will be automatically released when out of scope + return; + } + + unsigned int GetCachedObjectsNumber(void) const { + unsigned int num_entries = 0; + for (auto const& l3 : *p_blobmap_) { + for (auto const& l2 : *(l3.second)) { + num_entries += (l2.second)->size(); + } + } + return num_entries; + } + + OneDNNContext::BlobPtr_t GetBlob(const std::string& name) const { + BlobMap* pMap = p_blobmap_.get(); + BlobPtr_t sBlob = nullptr; + BlobPtr_t pBlob = nullptr; + + int sid = OneDNNContext::tls().get_cur_mkldnn_session_id(); + + std::lock_guard lock(*p_mutex_); + + // Find ShapeBlob for current mkldnn session id firstly + auto map_it = pMap->find(sid); + // (jczaja): After first iteration of model's execution we + // should have all elements cached (mostly) so failures are unlikely (less + // likely for dynamic shapes) + if (unlikely(map_it == pMap->end())) { + VLOG(2) << "GetBlob: sid=" << sid << ", miss sid\n"; + return nullptr; + } + sBlob = map_it->second; + + // Find KeyBlob for current input shape secondly + auto sBlob_it = sBlob->find(OneDNNContext::tls().cur_input_shape_str); + if (unlikely(sBlob_it == sBlob->end())) { + VLOG(2) << "GetBlob: sid=" << OneDNNContext::tls().cur_input_shape_str + << ", miss input_shape_str\n"; + return nullptr; + } + pBlob = sBlob_it->second; + + // Find Blob via name + auto key_it = pBlob->find(name); + + if (unlikely(key_it == pBlob->end())) { + VLOG(2) << "GetBlob sid=" << sid << ", miss blob=" << name << "\n"; + return nullptr; + } + + VLOG(2) << "GetBlob sid=" << sid << ", get blob=" << name << "\n"; + // lock will be automatically released when out of scope + return key_it->second; + } + + std::shared_ptr p_blobmap_; + // Map key is pointer of executor and value is a data(iterator in map) needed + // to erase + std::shared_ptr p_exec_items_; + std::shared_ptr p_mutex_; + // 0 - clearing is allowed. x > 0 do not clear. 
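+  // BlockNextCacheClearing() increments this counter; each ResetBlobMap()
+  // call then consumes one unit instead of clearing, so several pending
+  // clears can be suppressed before normal cache clearing resumes.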
+ unsigned int block_next_cache_clearing_ = 0; +}; + +OneDNNContext::OneDNNContext(const Place& place) + : CPUContext(place), impl_(std::make_unique()) {} + +OneDNNContext::~OneDNNContext() = default; + +void OneDNNContext::ResetBlobMap(void* ptr) { impl_->ResetBlobMap(ptr); } + +void OneDNNContext::BlockNextCacheClearing() { + impl_->BlockNextCacheClearing(); +} + +size_t OneDNNContext::GetShapeBlobSize() const { + return impl_->GetShapeBlobSize(); +} + +void OneDNNContext::SetBlob(const std::string& name, + BlobPtr_t data) const { + impl_->SetBlob(name, data); +} + +unsigned int OneDNNContext::GetCachedObjectsNumber(void) const { + return impl_->GetCachedObjectsNumber(); +} + +OneDNNContext::BlobPtr_t OneDNNContext::GetBlob( + const std::string& name) const { + return impl_->GetBlob(name); +} + +} // namespace phi +#endif diff --git a/paddle/phi/backends/onednn/onednn_context.h b/paddle/phi/backends/onednn/onednn_context.h new file mode 100644 index 0000000000000..d7cf8a0ff4902 --- /dev/null +++ b/paddle/phi/backends/onednn/onednn_context.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#ifdef PADDLE_WITH_MKLDNN +#include +#include // NOLINT +#include "dnnl.hpp" // NOLINT +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/common/place.h" + +namespace phi { + +class OneDNNContextThreadLocals { + // default mkldnn session id + + typedef OneDNNContextThreadLocals self; + struct Body { + bool said_once = false; + size_t cur_mkldnn_session_id; + // Current data input shape string. + // - For fixed-shape, it's a null string in default. + // - For dynamic-shape, it's user specific. + std::string cur_input_shape_str; + // the cache capacity of different input shapes for MKLDNN. + // Default 1 means fixed input shape, not dynamic shape. + int cur_input_shape_cache_capacity; + // Recently registered data_format. 
This is needed to + // know for converting MKL-DNN Tensor to non MKL-DNN + DataLayout cur_paddle_data_layout; + // MKL-DNN stream used for execution of primitives (per-thread) + dnnl::engine cur_engine; + dnnl::stream cur_stream; + std::string key_suffix; // Key identifying current Executor + bool key_attach_thread_id = true; + void* exec_ptr_ = nullptr; + + Body(); + ~Body(); + void set_cur_mkldnn_session_id(size_t sid); + size_t get_cur_mkldnn_session_id(void); + void set_cur_input_shape_str(std::string input_shape_str); + void set_cur_input_shape_cache_capacity(int input_shape_cache_capacity); + void set_cur_paddle_data_layout(DataLayout dl); + DataLayout get_cur_paddle_data_layout(void); + void log_lib_version(void); + const dnnl::engine& get_engine(void) { return cur_engine; } + dnnl::stream& get_stream(void) { return cur_stream; } + void set_key_suffix(const std::string& suffix) { key_suffix = suffix; } + const std::string& get_key_suffix(void) const { return key_suffix; } + void disable_tid_in_key(void) { key_attach_thread_id = false; } + bool is_tid_used_in_key(void) const { return key_attach_thread_id; } + void set_curr_exec(void* exec_ptr) { exec_ptr_ = exec_ptr; } + void* get_curr_exec(void) const { return exec_ptr_; } + }; + OneDNNContextThreadLocals() = default; + OneDNNContextThreadLocals(const OneDNNContextThreadLocals& c) = delete; + + public: + // default mkldnn session id + static constexpr size_t kMKLDNNSessionID_Default = 0; + // mkldnn session id for cache clearing mode + static constexpr size_t kMKLDNNSessionID_CacheClearing = -1; + static Body& fetch() { + thread_local Body b; + return b; + } +}; + +class OneDNNContext : public CPUContext { + public: + template + using BlobPtr_t = std::shared_ptr; + template + using umap_value_smart_t = std::unordered_map>; + template + using umap_key_string_t = umap_value_smart_t; + + // Following three maps are used to cache MKLDNN primitives. + // There relations are: + // - BlobMap = Map + // - ShapeBlob = Map + // - KeyBlob = Map + + using KeyBlob = umap_key_string_t; + using ShapeBlob = umap_key_string_t; + using BlobMap = umap_value_smart_t; + + // Auxillary two-level structure (shape, executor) to easier control + // clearing cache objects related to specific executor + + using ExecKey = void*; + using ExecMapCacheIterPair = std::pair, KeyBlob::iterator>; + using ExecMap = + std::unordered_map>; + using ExecShape = std::unordered_map>; + + explicit OneDNNContext(const Place& place); + ~OneDNNContext(); + /* \brief Get the active engine */ + const dnnl::engine& GetEngine() const { return tls().get_engine(); } + + // Remove all entries from the blob map + void ResetBlobMap(void* ptr); + + // Prevent next ResetBlobMap() + void BlockNextCacheClearing(); + + // Get the ShapeBlob size in cur_mkldnn_session_id. + size_t GetShapeBlobSize() const; + + // Set data to blob (i.e. name/data pair). Create blob if not existing + void SetBlob(const std::string& name, std::shared_ptr data) const; + + // Calculate number of oneDNN objects cached + unsigned int GetCachedObjectsNumber(void) const; + + // Find a saved blob. 
Return nullptr if not found + std::shared_ptr GetBlob(const std::string& name) const; + + static auto tls() -> decltype(OneDNNContextThreadLocals::fetch()) { + return OneDNNContextThreadLocals::fetch(); + } + + private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace phi +#endif diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 3e1787cb12cfa..c6d49bd5b978b 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -50,7 +50,7 @@ enum class Backend : uint8_t { MLU, // MLU currently does not exist at the same time as CUDA // the third library backend - MKLDNN, + ONEDNN, GPUDNN, // cuDNN and hipDNN // paddle kernel primitives backend @@ -118,8 +118,8 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::MLU: os << "MLU"; break; - case Backend::MKLDNN: - os << "MKLDNN"; + case Backend::ONEDNN: + os << "ONEDNN"; break; case Backend::GPUDNN: os << "GPUDNN"; @@ -160,8 +160,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::NPU; } else if (s == std::string("MLU")) { return Backend::MLU; - } else if (s == std::string("MKLDNN")) { - return Backend::MKLDNN; + } else if (s == std::string("OneDNN")) { + return Backend::ONEDNN; } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 18c39bfae1d18..231aaeebaccce 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -66,7 +66,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #endif #ifdef PADDLE_WITH_MKLDNN - case phi::Backend::MKLDNN: + case phi::Backend::ONEDNN: return phi::CPUPlace(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index d864544e10dd8..d6f6e60fe2d3d 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -46,9 +46,17 @@ const Kernel& KernelFactory::SelectKernel(const std::string& kernel_name, return empty_kernel; } auto kernel_iter = iter->second.find(kernel_key); + if (kernel_iter == iter->second.end() && + kernel_key.layout() != phi::DataLayout::ALL_LAYOUT) { + phi::KernelKey any_layout_kernel_key( + kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()); + kernel_iter = iter->second.find(any_layout_kernel_key); + } + if (kernel_iter == iter->second.end()) { return empty_kernel; } + return kernel_iter->second; } diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 65f655d50375c..010d5c2e0c379 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -56,6 +56,9 @@ struct KernelArgsParseFunctor { auto args_type = ParseArgType(Indices{}); for (auto arg_type : args_type) { if (arg_type == std::type_index(typeid(const CPUContext&)) +#if defined(PADDLE_WITH_MKLDNN) + || arg_type == std::type_index(typeid(const OneDNNContext&)) +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || arg_type == std::type_index(typeid(const GPUContext&))) { #elif defined(PADDLE_WITH_XPU) @@ -63,6 +66,7 @@ struct KernelArgsParseFunctor { #elif defined(PADDLE_WITH_CUSTOM_DEVICE) || arg_type == std::type_index(typeid(const CustomContext&))) { #else + ) { #endif // do nothing, skip context arg now diff --git 
a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 3b5fd0247a484..73814fc3f4048 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -17,6 +17,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" @@ -257,7 +258,9 @@ struct KernelImpl { #ifdef PADDLE_WITH_CUSTOM_DEVICE PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext); #endif - +#ifdef PADDLE_WITH_MKLDNN + PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(OneDNNContext); +#endif /* Input Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index af6cfb8812de8..455d42b548606 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -113,11 +113,13 @@ file( # file(GLOB kernel_cudnn "gpudnn/*.cu") # file(GLOB kernel_kps "kps/*.cu") file(GLOB kernel_xpu "xpu/*.cc") +file(GLOB kernel_onednn "onednn/*.cc") add_library(phi_cpu ${kernel_cc}) kernel_declare("${kernel_cc}") target_link_libraries(phi_cpu ${COMMON_KERNEL_DEPS}) -set_property(GLOBAL PROPERTY PHI_KERNELS phi_cpu) + +set(ADD_PHI_KERNELS phi_cpu) if(WITH_GPU OR WITH_ROCM) if(WITH_GPU) @@ -127,7 +129,7 @@ if(WITH_GPU OR WITH_ROCM) endif() kernel_declare("${kernel_cu}") target_link_libraries(phi_gpu ${COMMON_KERNEL_DEPS}) - set_property(GLOBAL PROPERTY PHI_KERNELS phi_cpu phi_gpu) + set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_gpu) endif() if(WITH_XPU) @@ -148,5 +150,15 @@ if(WITH_XPU) kernel_declare("${kernel_xpu}") kernel_declare("${kernel_xpu_kps}") target_link_libraries(phi_xpu ${COMMON_KERNEL_DEPS}) - set_property(GLOBAL PROPERTY PHI_KERNELS phi_cpu phi_xpu) + set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_xpu) +endif() + +if(WITH_MKLDNN) + add_library(phi_onednn ${kernel_onednn}) + kernel_declare(${kernel_onednn}) + set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} onednn_context) + target_link_libraries(phi_onednn ${COMMON_KERNEL_DEPS}) + set(ADD_PHI_KERNELS ${ADD_PHI_KERNELS} phi_onednn) endif() + +set_property(GLOBAL PROPERTY PHI_KERNELS ${ADD_PHI_KERNELS}) diff --git a/paddle/phi/kernels/cpu/log_softmax_kernel.cc b/paddle/phi/kernels/cpu/log_softmax_kernel.cc index 510eb7a6ca97a..0ba4aea78c3ca 100644 --- a/paddle/phi/kernels/cpu/log_softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/log_softmax_kernel.cc @@ -116,5 +116,7 @@ void LogSoftmaxKernel(const Context& dev_ctx, } // namespace phi +// TODO(YuanRisheng): The layout of mkldnn kernel should be MKLDNN, we should +// support specifying the exact layout when the kernel is registered PD_REGISTER_KERNEL( log_softmax, CPU, ALL_LAYOUT, phi::LogSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/onednn/log_softmax_kernel.cc b/paddle/phi/kernels/onednn/log_softmax_kernel.cc new file mode 100644 index 0000000000000..254e975dd45ec --- /dev/null +++ b/paddle/phi/kernels/onednn/log_softmax_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/log_softmax_kernel.h" + +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +class LogSoftmaxMKLDNNHandler + : public paddle::platform:: + MKLDNNHandlerNoCachingT { + public: + LogSoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, + Place cpu_place, + const DenseTensor& x, + const int axis) + : paddle::platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_inference, x.mem_desc(), axis); + } +}; + +template +void LogSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const auto& mkldnn_engine = dev_ctx.GetEngine(); + axis = axis >= 0 ? axis : x.dims().size() + axis; + + LogSoftmaxMKLDNNHandler handler( + mkldnn_engine, dev_ctx.GetPlace(), x, axis); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + auto dst_memory_p = handler.AcquireDstMemory(out); + + auto logsoftmax_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + logsoftmax_p->execute( + astream, {{DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + out->set_mem_desc(dst_memory_p->get_desc()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(log_softmax, + OneDNN, + ALL_LAYOUT, + phi::LogSoftmaxKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index c1550e31fae88..415c1f21465ed 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -39,8 +39,8 @@ TEST(Backend, OStream) { oss << phi::Backend::NPU; EXPECT_EQ(oss.str(), "NPU"); oss.str(""); - oss << phi::Backend::MKLDNN; - EXPECT_EQ(oss.str(), "MKLDNN"); + oss << phi::Backend::ONEDNN; + EXPECT_EQ(oss.str(), "ONEDNN"); oss.str(""); oss << phi::Backend::GPUDNN; EXPECT_EQ(oss.str(), "GPUDNN"); @@ -63,7 +63,7 @@ TEST(Backend, StringToBackend) { EXPECT_EQ(phi::Backend::GPU, pexp::StringToBackend("GPU")); EXPECT_EQ(phi::Backend::XPU, pexp::StringToBackend("XPU")); EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); - EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); + EXPECT_EQ(phi::Backend::ONEDNN, pexp::StringToBackend("OneDNN")); EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN")); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EXPECT_EQ(phi::Backend::GPU, pexp::StringToBackend("KPS")); From 4baf0dbe742100e5ffe63f6fd19f92cb280f818c Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 14 Jul 2022 18:51:22 +0800 Subject: [PATCH 209/250] Compilation optimization (#44242) * Compilation optimization --- .../eager_generated/backwards/CMakeLists.txt | 4 +- .../eager_generated/forwards/CMakeLists.txt | 4 +- .../eager_manual/forwards/CMakeLists.txt | 17 +- .../manual/eager_manual/nodes/CMakeLists.txt | 13 +- .../fluid_manual/forwards/CMakeLists.txt | 29 +- 
.../manual/fluid_manual/nodes/CMakeLists.txt | 19 +- .../auto_code_generator/eager_generator.cc | 45 +- .../generate_file_structures.py | 17 +- .../framework/new_executor/CMakeLists.txt | 81 +- paddle/fluid/imperative/CMakeLists.txt | 19 +- paddle/fluid/operators/CMakeLists.txt | 2 +- .../platform/device/gpu/cuda/CMakeLists.txt | 2 +- paddle/fluid/pybind/.gitignore | 9 +- paddle/fluid/pybind/CMakeLists.txt | 122 +- .../fluid/pybind/generate_file_structures.py | 8 +- paddle/fluid/pybind/imperative.cc | 10 +- paddle/fluid/pybind/op_function.h | 9 +- paddle/fluid/pybind/op_function_generator.cc | 111 +- paddle/fluid/pybind/parallel_executor.cc | 1118 ++++++++ paddle/fluid/pybind/parallel_executor.h | 25 + paddle/fluid/pybind/place.cc | 816 ++++++ paddle/fluid/pybind/place.h | 25 + paddle/fluid/pybind/pybind.cc | 2447 +---------------- paddle/fluid/pybind/tensor.cc | 1106 ++++++++ paddle/fluid/pybind/tensor.h | 25 + 25 files changed, 3420 insertions(+), 2663 deletions(-) create mode 100644 paddle/fluid/pybind/parallel_executor.cc create mode 100644 paddle/fluid/pybind/parallel_executor.h create mode 100644 paddle/fluid/pybind/place.cc create mode 100644 paddle/fluid/pybind/place.h create mode 100644 paddle/fluid/pybind/tensor.cc create mode 100644 paddle/fluid/pybind/tensor.h diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index fbd552ef00da7..1f2b30853c6bf 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -6,7 +6,7 @@ cc_library( if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( final_dygraph_node - SRCS nodes.cc - DEPS ${eager_deps} ${eager_manual_nodes}) + SRCS nodes.cc ${eager_manual_nodes} + DEPS ${eager_deps}) add_dependencies(final_dygraph_node eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 66053baa5813b..9baf8956fe2e4 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -6,7 +6,7 @@ cc_library( if(NOT (NOT WITH_PYTHON AND ON_INFER)) cc_library( final_dygraph_function - SRCS dygraph_functions.cc - DEPS ${eager_deps} ${eager_manual_functions}) + SRCS dygraph_functions.cc ${eager_manual_functions} + DEPS ${eager_deps}) add_dependencies(final_dygraph_function eager_final_state_codegen) endif() diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt index d71f1153e2fc0..d25b3ba08b5a6 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/CMakeLists.txt @@ -1,17 +1,4 @@ -cc_library( - add_n_fwd_func - SRCS add_n_fwd_func.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - -add_dependencies(add_n_fwd_func eager_codegen) - -cc_library( - conv2d_fwd_function - SRCS conv2d_fwd_function.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - -add_dependencies(conv2d_fwd_function eager_codegen) - set(eager_manual_functions - conv2d_fwd_function add_n_fwd_func + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc + 
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt index fa6a9a53abae3..ac5ce176f4e37 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/CMakeLists.txt @@ -1,13 +1,4 @@ -cc_library( - add_n_node - SRCS add_n_node.cc - DEPS ${eager_deps} ${fluid_deps}) - -cc_library( - conv2d_nodes - SRCS conv2d_nodes.cc - DEPS ${eager_deps} ${fluid_deps}) - set(eager_manual_nodes - conv2d_nodes add_n_node + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/conv2d_nodes.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt index 295b8d9a6408f..5c47b0870a203 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt @@ -1,28 +1,5 @@ -cc_library( - fused_gate_attention_fwd_func - SRCS fused_gate_attention_fwd_func.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - -add_dependencies(fused_gate_attention_fwd_func eager_codegen - copy_dygraph_forward_functions) - -cc_library( - fused_feedforward_fwd_func - SRCS fused_feedforward_fwd_func.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - -add_dependencies(fused_feedforward_fwd_func eager_codegen - copy_dygraph_forward_functions) - -cc_library( - fused_attention_fwd_func - SRCS fused_attention_fwd_func.cc - DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) - -add_dependencies(fused_attention_fwd_func eager_codegen - copy_dygraph_forward_functions) - set(fluid_manual_functions - fused_gate_attention_fwd_func fused_feedforward_fwd_func - fused_attention_fwd_func + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc PARENT_SCOPE) diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt index 28c034e8b5ddb..101ed5d589075 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt @@ -1,18 +1,5 @@ -cc_library( - fused_gate_attention_node - SRCS fused_gate_attention_node.cc - DEPS ${eager_deps} ${fluid_deps}) - -cc_library( - fused_feedforward_node - SRCS fused_feedforward_node.cc - DEPS ${eager_deps} ${fluid_deps}) - -cc_library( - fused_attention_node - SRCS fused_attention_node.cc - DEPS ${eager_deps} ${fluid_deps}) - set(fluid_manual_nodes - fused_gate_attention_node fused_feedforward_node fused_attention_node + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc + ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc PARENT_SCOPE) diff --git 
a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 4f5efe74fa9a6..54b40c72d0215 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -3083,27 +3083,44 @@ static std::string ConvertCoreOpsInfosToString( return core_ops_returns_info_init_str; } -static std::string GenerateCoreOpsReturnsInfo() { +static std::string GenerateCoreOpsArgsInfo() { const char* Core_Ops_Returns_MAP_TEMPLATE = "std::unordered_map> " - "core_ops_args_info = { %s };\n" - "std::unordered_map> " - "core_ops_args_type_info = { %s };\n" - "std::unordered_map> " - "core_ops_returns_info = { %s };\n"; + "core_ops_args_info = { %s };\n"; std::string core_ops_args_info_init_str = ConvertCoreOpsInfosToString(core_ops_args_info); + + std::string core_ops_info_str = paddle::string::Sprintf( + Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_info_init_str); + + return core_ops_info_str; +} + +static std::string GenerateCoreOpsArgsTypeInfo() { + const char* Core_Ops_Returns_MAP_TEMPLATE = + "std::unordered_map> " + "core_ops_args_type_info = { %s };\n"; + std::string core_ops_args_type_info_init_str = ConvertCoreOpsInfosToString(core_ops_args_type_info); + + std::string core_ops_info_str = paddle::string::Sprintf( + Core_Ops_Returns_MAP_TEMPLATE, core_ops_args_type_info_init_str); + + return core_ops_info_str; +} + +static std::string GenerateCoreOpsReturnsInfo() { + const char* Core_Ops_Returns_MAP_TEMPLATE = + "std::unordered_map> " + "core_ops_returns_info = { %s };\n"; + std::string core_ops_returns_info_init_str = ConvertCoreOpsInfosToString(core_ops_returns_info); - std::string core_ops_info_str = - paddle::string::Sprintf(Core_Ops_Returns_MAP_TEMPLATE, - core_ops_args_info_init_str, - core_ops_args_type_info_init_str, - core_ops_returns_info_init_str); + std::string core_ops_info_str = paddle::string::Sprintf( + Core_Ops_Returns_MAP_TEMPLATE, core_ops_returns_info_init_str); return core_ops_info_str; } @@ -3252,6 +3269,12 @@ static void DygraphCodeGeneration(const std::string& output_dir, GenerateForwardDygraphFile( output_dir + "/forwards/dygraph_forward_functions_args_info.tmp.cc", + GenerateCoreOpsArgsInfo()); + GenerateForwardDygraphFile( + output_dir + "/forwards/dygraph_forward_functions_args_type_info.tmp.cc", + GenerateCoreOpsArgsTypeInfo()); + GenerateForwardDygraphFile( + output_dir + "/forwards/dygraph_forward_functions_returns_info.tmp.cc", GenerateCoreOpsReturnsInfo()); VLOG(6) << "-------- GenerateNodeCCFile -------"; diff --git a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py index d6574bc2e81fb..9fbf1ed6cd4a1 100644 --- a/paddle/fluid/eager/auto_code_generator/generate_file_structures.py +++ b/paddle/fluid/eager/auto_code_generator/generate_file_structures.py @@ -96,6 +96,11 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): "nodes" + str(i + 1) + ".cc")) empty_files.append( os.path.join(forwards_dir, "dygraph_forward_functions_args_info.cc")) + empty_files.append( + os.path.join(forwards_dir, + "dygraph_forward_functions_args_type_info.cc")) + empty_files.append( + os.path.join(forwards_dir, "dygraph_forward_functions_returns_info.cc")) for path in empty_files: if not os.path.exists(path): open(path, 'a').close() @@ -125,7 +130,7 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): f.write("cc_library(dygraph_node SRCS 
") for i in range(split_count): f.write("nodes" + str(i + 1) + ".cc ") - f.write("DEPS ${eager_deps} ${fluid_deps} ${fluid_manual_nodes})\n") + f.write("${fluid_manual_nodes} DEPS ${eager_deps} ${fluid_deps})\n") f.write("add_dependencies(dygraph_node copy_dygraph_node)") with open(forwards_level_cmakelist_path, "w") as f: @@ -143,6 +148,12 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): f.write( " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_info.cc\"\n" ) + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_args_type_info.cc\"\n" + ) + f.write( + " COMMAND ${CMAKE_COMMAND} -E copy_if_different \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.tmp.cc\" \"${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/fluid_generated/forwards/dygraph_forward_functions_returns_info.cc\"\n" + ) f.write(" DEPENDS eager_codegen\n") f.write(" VERBATIM)\n") @@ -150,8 +161,10 @@ def GenerateFileStructureForIntermediateDygraph(eager_dir, split_count): for i in range(split_count): f.write("dygraph_forward_functions" + str(i + 1) + ".cc ") f.write("dygraph_forward_functions_args_info.cc ") + f.write("dygraph_forward_functions_args_type_info.cc ") + f.write("dygraph_forward_functions_returns_info.cc ") f.write( - "DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ${fluid_manual_functions})\n" + "${fluid_manual_functions} DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})\n" ) f.write( "add_dependencies(dygraph_function copy_dygraph_forward_functions)") diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index cf10734d1deeb..006e98f175423 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -1,4 +1,16 @@ -set(INTERPRETERCORE_DEPS +add_subdirectory(workqueue) +add_subdirectory(garbage_collector) + +set(STANDALONE_EXECUTOR_SRCS + data_transfer.cc + new_executor_defs.cc + interpretercore_util.cc + event_manager.cc + stream_analyzer.cc + interpretercore.cc + standalone_executor.cc) + +set(STANDALONE_EXECUTOR_DEPS op_registry device_context scope @@ -20,62 +32,33 @@ set(INTERPRETERCORE_DEPS variable_helper timer monitor - nan_inf_utils) - -add_subdirectory(workqueue) -add_subdirectory(garbage_collector) - -cc_library( - data_transfer - SRCS data_transfer.cc - DEPS enforce scope glog) -cc_library( - new_executor_defs - SRCS new_executor_defs.cc - DEPS enforce glog scope) -cc_library( - interpretercore_util - SRCS interpretercore_util.cc - DEPS ${INTERPRETERCORE_DEPS} workqueue new_executor_defs data_transfer) -cc_library( - event_manager - SRCS event_manager.cc - DEPS ${DEVICE_EVENT_LIBS} glog new_executor_defs) -cc_library( - stream_analyzer - SRCS stream_analyzer.cc - DEPS ${DEVICE_EVENT_LIBS} glog device_context new_executor_defs) + nan_inf_utils + enforce + scope + glog + enforce + glog + scope + workqueue + interpretercore_event_garbage_collector + 
${DEVICE_EVENT_LIBS} + glog) if(WITH_GPU OR WITH_ROCM) - cc_library( - interpretercore - SRCS interpretercore.cc - DEPS workqueue - ${DEVICE_EVENT_LIBS} - interpretercore_util - interpretercore_event_garbage_collector - interpretercore_fast_garbage_collector - stream_analyzer - event_manager) -else() - cc_library( - interpretercore - SRCS interpretercore.cc - DEPS workqueue ${DEVICE_EVENT_LIBS} interpretercore_util - interpretercore_event_garbage_collector stream_analyzer event_manager) + set(STANDALONE_EXECUTOR_DEPS ${STANDALONE_EXECUTOR_DEPS} + interpretercore_fast_garbage_collector) endif() cc_library( standalone_executor - SRCS standalone_executor.cc - DEPS interpretercore) + SRCS ${STANDALONE_EXECUTOR_SRCS} + DEPS ${STANDALONE_EXECUTOR_DEPS}) cc_library( staticgraph_executor_statistics SRCS executor_statistics.cc DEPS enforce glog os_info) -# cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) # skip win32 since wget is not installed by default on windows machine. if(WITH_GPU AND WITH_TESTING @@ -120,13 +103,7 @@ if(WITH_GPU cc_test( standalone_executor_test SRCS standalone_executor_test.cc - DEPS interpretercore - standalone_executor - operator - op_registry - executor - ${OPS} - ${OP_DEPS}) + DEPS standalone_executor operator op_registry executor ${OPS} ${OP_DEPS}) set_tests_properties(standalone_executor_test PROPERTIES TIMEOUT 100) add_dependencies(standalone_executor_test download_program) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 2d4a57b82a186..98ece2db96c1b 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -5,7 +5,7 @@ cc_library( cc_library( var_helper SRCS var_helper.cc - DEPS tensor phi_api) + DEPS tensor selected_rows) if(WITH_XPU) cc_library( prepared_operator @@ -20,8 +20,8 @@ if(WITH_XPU) op_kernel_type data_transform nan_inf_utils - phi_api - phi_utils + scalar + int_array var_helper profiler) else() @@ -37,21 +37,16 @@ else() op_kernel_type data_transform nan_inf_utils - phi_api - phi_utils + scalar + int_array var_helper profiler) endif() cc_library( layer SRCS layer.cc - DEPS prepared_operator - math_function - imperative_flag - variable_helper - op_registry - var_helper - phi_api) + DEPS prepared_operator math_function imperative_flag variable_helper + op_registry var_helper) add_subdirectory(jit) if(WITH_GPU) cc_library( diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 7fb00504ee2db..809ad5174b60b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -101,7 +101,7 @@ else() cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor) endif() -set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel) +set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta) register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index da9121550e07a..15c7a6c462495 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ 
b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -10,4 +10,4 @@ nv_library( nv_test( cudnn_helper_test SRCS cudnn_helper_test.cc - DEPS dynload_cuda phi) + DEPS dynload_cuda) diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore index bd45f1ec2ea30..a6f20e21801f7 100644 --- a/paddle/fluid/pybind/.gitignore +++ b/paddle/fluid/pybind/.gitignore @@ -1,4 +1,11 @@ pybind.h -op_function.cc +op_function1.cc +op_function2.cc +op_function3.cc +op_function4.cc +op_function5.cc +op_function6.cc +op_function7.cc +op_function8.cc eager_op_function.cc eager_final_state_op_function.cc diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d5c7bcc30d176..f301189d77824 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -102,13 +102,16 @@ endif() set(PYBIND_SRCS pybind.cc imperative.cc - op_function.cc inference_api.cc ir.cc bind_fleet_executor.cc reader_py.cc protobuf.cc exception.cc + op_function_common.cc + parallel_executor.cc + tensor.cc + place.cc const_value.cc global_value_getter_setter.cc fleet_wrapper_py.cc @@ -124,13 +127,15 @@ set(PYBIND_SRCS generator_py.cc communication.cc cuda_streams_py.cc - jit.cc) - -execute_process( - COMMAND - "${PYTHON_EXECUTABLE}" - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py" - "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/") + jit.cc + op_function1.cc + op_function2.cc + op_function3.cc + op_function4.cc + op_function5.cc + op_function6.cc + op_function7.cc + op_function8.cc) if(WITH_CUSTOM_DEVICE) set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) @@ -267,12 +272,35 @@ if(WITH_PYTHON) target_link_libraries(kernel_signature_generator ${ROCM_HIPRTC_LIB}) endif() - set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function.cc) - set(tmp_impl_file ${impl_file}.tmp) + set(op_function_output_path ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/) + set(impl_file1 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function1.cc) + set(tmp_impl_file1 ${impl_file1}.tmp) + set(impl_file2 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function2.cc) + set(tmp_impl_file2 ${impl_file2}.tmp) + set(impl_file3 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function3.cc) + set(tmp_impl_file3 ${impl_file3}.tmp) + set(impl_file4 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function4.cc) + set(tmp_impl_file4 ${impl_file4}.tmp) + set(impl_file5 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function5.cc) + set(tmp_impl_file5 ${impl_file5}.tmp) + set(impl_file6 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function6.cc) + set(tmp_impl_file6 ${impl_file6}.tmp) + set(impl_file7 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function7.cc) + set(tmp_impl_file7 ${impl_file7}.tmp) + set(impl_file8 ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function8.cc) + set(tmp_impl_file8 ${impl_file8}.tmp) + set(CODE_GEN_SPLIT_FILE_COUNT "8") set(eager_impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/eager_op_function.cc) set(tmp_eager_impl_file ${eager_impl_file}.tmp) + execute_process( + COMMAND + "${PYTHON_EXECUTABLE}" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/generate_file_structures.py" + "${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/" + "${CODE_GEN_SPLIT_FILE_COUNT}") + set(OP_IMPL_DEPS op_function_generator) set(EAGER_OP_IMPL_DEPS eager_op_function_generator eager_final_state_python_c_codegen) @@ -292,7 +320,7 @@ if(WITH_PYTHON) ":retry\n" "ECHO op_function_generator run %build_times% time\n" "taskkill /f /im op_function_generator.exe 2>NUL\n" - "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n" + 
"${op_impl_path}/op_function_generator.exe ${op_function_output_path} ${CODE_GEN_SPLIT_FILE_COUNT}\n" "if %ERRORLEVEL% NEQ 0 (\n" " set /a build_times=%build_times%+1\n" " if %build_times% GEQ 10 (\n" @@ -367,12 +395,33 @@ if(WITH_PYTHON) endif() add_custom_command( - OUTPUT ${impl_file} + OUTPUT op_function COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} - ${impl_file} - COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1} + ${impl_file1} + COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2} + ${impl_file2} + COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3} + ${impl_file3} + COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4} + ${impl_file4} + COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5} + ${impl_file5} + COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6} + ${impl_file6} + COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7} + ${impl_file7} + COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8} + ${impl_file8} + COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}" DEPENDS ${OP_IMPL_DEPS}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_command( @@ -431,13 +480,35 @@ if(WITH_PYTHON) list(APPEND EAGER_OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) endif() add_custom_command( - OUTPUT ${impl_file} + OUTPUT op_function COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." 
- "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}" - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} - ${impl_file} - COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" + "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" + "${op_function_output_path}" "${CODE_GEN_SPLIT_FILE_COUNT}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file1} + ${impl_file1} + COMMENT "copy_if_different ${tmp_impl_file1} to ${impl_file1}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file2} + ${impl_file2} + COMMENT "copy_if_different ${tmp_impl_file2} to ${impl_file2}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file3} + ${impl_file3} + COMMENT "copy_if_different ${tmp_impl_file3} to ${impl_file3}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file4} + ${impl_file4} + COMMENT "copy_if_different ${tmp_impl_file4} to ${impl_file4}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file5} + ${impl_file5} + COMMENT "copy_if_different ${tmp_impl_file5} to ${impl_file5}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file6} + ${impl_file6} + COMMENT "copy_if_different ${tmp_impl_file6} to ${impl_file6}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file7} + ${impl_file7} + COMMENT "copy_if_different ${tmp_impl_file7} to ${impl_file7}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file8} + ${impl_file8} + COMMENT "copy_if_different ${tmp_impl_file8} to ${impl_file8}" DEPENDS ${OP_IMPL_DEPS} VERBATIM) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) @@ -454,19 +525,13 @@ if(WITH_PYTHON) VERBATIM) endif() endif() - add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) + add_custom_target(op_function_generator_cmd ALL DEPENDS op_function) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) add_custom_target(eager_op_function_generator_cmd ALL DEPENDS ${eager_impl_file}) endif() - list(APPEND PYBIND_DEPS interpretercore standalone_executor - staticgraph_executor_statistics) - cc_library( - op_function_common - SRCS op_function_common.cc - DEPS ${PYBIND_DEPS}) - list(APPEND PYBIND_DEPS op_function_common) + list(APPEND PYBIND_DEPS standalone_executor staticgraph_executor_statistics) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) set(PYBIND_SRCS eager.cc ${PYBIND_SRCS}) @@ -482,7 +547,6 @@ if(WITH_PYTHON) list(APPEND PYBIND_DEPS backward) list(APPEND PYBIND_DEPS grad_node_info) list(APPEND PYBIND_DEPS phi) - list(APPEND PYBIND_DEPS op_function_common) list(APPEND PYBIND_DEPS final_dygraph_function) list(APPEND PYBIND_DEPS final_dygraph_node) list(APPEND PYBIND_DEPS dygraph_function) diff --git a/paddle/fluid/pybind/generate_file_structures.py b/paddle/fluid/pybind/generate_file_structures.py index 391c47b8ee700..bc61ecdcc96f5 100644 --- a/paddle/fluid/pybind/generate_file_structures.py +++ b/paddle/fluid/pybind/generate_file_structures.py @@ -16,12 +16,16 @@ import os if __name__ == "__main__": - assert len(sys.argv) == 2 + assert len(sys.argv) == 3 pybind_dir = sys.argv[1] + split_count = int(sys.argv[2]) empty_files = [os.path.join(pybind_dir, "eager_final_state_op_function.cc")] empty_files.append(os.path.join(pybind_dir, "eager_op_function.cc")) - empty_files.append(os.path.join(pybind_dir, "op_function.cc")) + + for i in range(split_count): + empty_files.append( + os.path.join(pybind_dir, "op_function" + str(i + 1) + ".cc")) for path in empty_files: if not os.path.exists(path): diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 
569890fa25cd6..8a21271db409f 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -64,6 +64,7 @@ limitations under the License. */ namespace paddle { namespace pybind { +std::atomic VarBaseUniqueNameID{0}; PyTypeObject *g_varbase_pytype = nullptr; namespace py = ::pybind11; @@ -497,7 +498,14 @@ static void VarBaseCopy(std::shared_ptr &src, // NOLINT void BindImperative(py::module *m_ptr) { auto &m = *m_ptr; - BindOpFunctions(&m); + BindOpFunctions1(&m); + BindOpFunctions2(&m); + BindOpFunctions3(&m); + BindOpFunctions4(&m); + BindOpFunctions5(&m); + BindOpFunctions6(&m); + BindOpFunctions7(&m); + BindOpFunctions8(&m); #ifndef _WIN32 // Dygraph DataLoader signal handler diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 5038dd5e6c4e2..884136ec0d37b 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -257,7 +257,14 @@ PyObject* MakeReturnPyObject(const std::tuple& out) { return result; } -void BindOpFunctions(pybind11::module* module); +void BindOpFunctions1(pybind11::module* module); +void BindOpFunctions2(pybind11::module* module); +void BindOpFunctions3(pybind11::module* module); +void BindOpFunctions4(pybind11::module* module); +void BindOpFunctions5(pybind11::module* module); +void BindOpFunctions6(pybind11::module* module); +void BindOpFunctions7(pybind11::module* module); +void BindOpFunctions8(pybind11::module* module); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 9ddf0e7083f44..7eeadac7cef2e 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -422,13 +422,17 @@ std::string GenerateOpFunctionsBody( return op_function_str; } -static std::tuple, std::vector> -GenerateOpFunctions() { +static std::vector< + std::tuple, std::vector>> +GenerateOpFunctions(int split_count) { auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); - + std::vector, std::vector>> + result; std::vector op_function_list, bind_function_list; auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); + paddle::flat_hash_map + op_info_map_need_gen; for (auto& pair : op_info_map) { auto& op_info = pair.second; auto op_proto = op_info.proto_; @@ -444,6 +448,22 @@ GenerateOpFunctions() { continue; } + op_info_map_need_gen.emplace(pair); + } + + int cc_file_api_size = op_info_map_need_gen.size() / split_count; + if (op_info_map_need_gen.size() % split_count != 0) { + cc_file_api_size++; + } + int api_index = 0; + int file_index = 0; + + for (auto& pair : op_info_map_need_gen) { + auto& op_info = pair.second; + auto op_proto = op_info.proto_; + + auto& op_type = op_proto->type(); + // NOTE(pangyoki): Inplace Strategy. // In this case, output will reuse input varbase. 
// Dygraph mode needs to be aligned with the in-place strategy in static @@ -489,13 +509,24 @@ GenerateOpFunctions() { op_function_list.emplace_back(std::move(inplace_op_function_str)); bind_function_list.emplace_back(std::move(inplace_bind_function_str)); } + + api_index++; + if (api_index / cc_file_api_size > file_index) { + file_index++; + result.push_back(std::make_tuple(op_function_list, bind_function_list)); + op_function_list.clear(); + bind_function_list.clear(); + } } - return std::make_tuple(op_function_list, bind_function_list); + + result.push_back(std::make_tuple(op_function_list, bind_function_list)); + + return result; } int main(int argc, char* argv[]) { - if (argc != 2) { - std::cerr << "argc must be 2" << std::endl; + if (argc != 3) { + std::cerr << "argc must be 3" << std::endl; return -1; } @@ -513,39 +544,45 @@ int main(int argc, char* argv[]) { "\"paddle/fluid/pybind/op_function.h\"", ""}; - std::ofstream out(argv[1], std::ios::out); + std::string path = argv[1]; + int split_count = atoi(argv[2]); - for (auto& header : headers) { - out << "#include " + header + "\n"; - } + auto op_funcs = GenerateOpFunctions(split_count); - out << "\n\n"; - - auto op_funcs = GenerateOpFunctions(); - - out << "namespace paddle {\n" - << "namespace pybind {\n\n"; - out << "std::atomic VarBaseUniqueNameID{0};\n"; - out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); - out << "\n\n"; - - out << "static PyMethodDef ExtestMethods[] = {\n" - << paddle::string::join_strings(std::get<1>(op_funcs), '\n') - << "\n {nullptr,nullptr,0,nullptr}" - << "};\n\n"; - - out << "void BindOpFunctions(pybind11::module *module) {\n" - << " auto m = module->def_submodule(\"ops\");\n" - << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" - << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " - "core.ops failed!\"));\n" - << " }\n\n" - << " InitOpsAttrTypeMap();" - << "}\n\n" - << "} // namespace pybind\n" - << "} // namespace paddle\n"; - - out.close(); + for (size_t i = 0; i < op_funcs.size(); i++) { + std::ofstream out(path + "op_function" + std::to_string(i + 1) + ".cc.tmp", + std::ios::out); + + for (auto& header : headers) { + out << "#include " + header + "\n"; + } + + out << "\n\n"; + + out << "namespace paddle {\n" + << "namespace pybind {\n\n"; + out << "extern std::atomic VarBaseUniqueNameID;\n"; + out << paddle::string::join_strings(std::get<0>(op_funcs[i]), '\n'); + out << "\n\n"; + + out << "static PyMethodDef ExtestMethods[] = {\n" + << paddle::string::join_strings(std::get<1>(op_funcs[i]), '\n') + << "\n {nullptr,nullptr,0,nullptr}" + << "};\n\n"; + + out << "void BindOpFunctions" << i + 1 << "(pybind11::module *module) {\n" + << " auto m = module->def_submodule(\"ops\");\n" + << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" + << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " + "core.ops failed!\"));\n" + << " }\n\n" + << " InitOpsAttrTypeMap();" + << "}\n\n" + << "} // namespace pybind\n" + << "} // namespace paddle\n"; + + out.close(); + } #ifdef PADDLE_WITH_ASCEND_CL ge::GEFinalize(); diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc new file mode 100644 index 0000000000000..f1d2f456a28d9 --- /dev/null +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -0,0 +1,1118 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/save_load_util.h" +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include 
"paddle/fluid/platform/profiler/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" +#include "paddle/utils/none.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif +#include "paddle/fluid/pybind/bind_cost_model.h" +#include "paddle/fluid/pybind/bind_fleet_executor.h" +#include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" +#include "paddle/fluid/pybind/compatible.h" +#include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" +#include "paddle/fluid/pybind/generator_py.h" +#include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" +#include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" +#include "paddle/fluid/pybind/inference_api.h" +#include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" +#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/fluid/string/to_string.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#ifndef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + +#ifdef PADDLE_WITH_CRYPTO +#include "paddle/fluid/pybind/crypto.h" +#endif + +#if defined PADDLE_WITH_PSCORE +#include "paddle/fluid/pybind/fleet_py.h" +#endif + +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/parallel_executor.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "pybind11/stl.h" + +DECLARE_bool(use_mkldnn); 
+ +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); + +namespace paddle { +namespace pybind { +using namespace paddle::framework; // NOLINT +void BindParallelExecutor(pybind11::module &m) { // NOLINT + // -- python binds for parallel executor. + py::class_ pe(m, "ParallelExecutor"); + py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( + ExecutionStrategy allows the user to more preciously control how to run + the program in ParallelExecutor by setting the property. + + Returns: + ExecutionStrategy: An ExecutionStrategy object. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + import paddle.nn.functional as F + + paddle.enable_static() + + x = static.data(name='x', shape=[None, 13], dtype='float32') + y = static.data(name='y', shape=[None, 1], dtype='float32') + y_predict = static.nn.fc(input=x, size=1, act=None) + + cost = F.square_error_cost(input=y_predict, label=y) + avg_loss = paddle.mean(cost) + + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_loss) + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_threads = 4 + + train_exe = static.ParallelExecutor(use_cuda=False, + loss_name=avg_loss.name, + exec_strategy=exec_strategy) + )DOC"); + + py::enum_(m, "DeviceType", py::arithmetic()) + .value("CPU", paddle::platform::DeviceType::CPU) + .value("CUDA", paddle::platform::DeviceType::CUDA) + .value("XPU", paddle::platform::DeviceType::XPU); + + exec_strategy.def(py::init()) + .def_property( + "num_threads", + [](const ExecutionStrategy &self) { return self.num_threads_; }, + [](ExecutionStrategy &self, size_t num_threads) { + self.num_threads_ = num_threads; + }, + R"DOC( + The type is INT, num_threads represents the size of thread pool that + used to run the operators of the current program in ParallelExecutor. + If :math:`num\_threads=1`, all the operators will execute one by one, + but the order maybe difference between iterations. + If it is not set, it will be set in ParallelExecutor according to the + device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, + :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. + if it is not set, ParallelExecutor will get the cpu count by calling + `multiprocessing.cpu_count()`. Default 0. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_threads = 4 + )DOC") + .def_property( + "_use_device", + [](const ExecutionStrategy &self) { return self.use_device_; }, + [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) { + self.use_device_ = use_device; + }) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because + // use_device isn‘t exposed to users. + .def_property( + "allow_op_delay", + [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, + [](ExecutionStrategy &self, bool allow_op_delay) { + self.allow_op_delay_ = allow_op_delay; + }, + R"DOC(The type is BOOL, allow_op_delay represents whether to delay the + communication operators to run, it may make the execution faster. + Note that this option is invalid now, and it will be removed in + next version. 
Default False.)DOC") + .def_property( + "num_iteration_per_drop_scope", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_drop_scope_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { + self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; + }, + R"DOC(The type is INT, num_iteration_per_drop_scope indicates how + many iterations to clean up the temp variables which + is generated during execution. It may make the execution faster, + because the temp variable's shape maybe the same between two iterations. + Default 100. + + .. note:: + 1. If you fetch data when calling the 'run', the ParallelExecutor + will clean up the temp variables at the end of the current iteration. + 2. In some NLP model, it may cause the GPU memory is insufficient, + in this case, you should reduce `num_iteration_per_drop_scope`. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_iteration_per_drop_scope = 10 + )DOC") + .def_property( + "num_iteration_per_run", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_run_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_run) { + self.num_iteration_per_run_ = num_iteration_per_run; + }, + R"DOC(This config that how many iteration the executor will run when + user call exe.run() in python。Default: 1. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + exec_strategy = static.ExecutionStrategy() + exec_strategy.num_iteration_per_run = 10 + )DOC") + .def_property( + "use_thread_barrier", + [](const ExecutionStrategy &self) { return self.thread_barrier_; }, + [](ExecutionStrategy &self, bool use_thread_barrier) { + self.thread_barrier_ = use_thread_barrier; + }, + R"DOC(This config that the this is distributed training with parameter server + )DOC") + .def_property( + "_dry_run", + [](const ExecutionStrategy &self) { return self.dry_run_; }, + [](ExecutionStrategy &self, bool dry_run) { + self.dry_run_ = dry_run; + }); + + exec_strategy.def_property( + "use_experimental_executor", + [](const ExecutionStrategy &self) { + return self.type_ == ExecutionStrategy::kExperimental; + }, + [](ExecutionStrategy &self, bool experimental) { + self.type_ = experimental ? ExecutionStrategy::kExperimental + : ExecutionStrategy::kDefault; + }); + + py::class_ build_strategy(pe, "BuildStrategy", R"DOC( + BuildStrategy allows the user to more preciously control how to + build the SSA Graph in ParallelExecutor by setting the property. + + Returns: + BuildStrategy: An BuildStrategy object. + + Examples: + .. 
code-block:: python + + import os + import paddle + import paddle.static as static + + paddle.enable_static() + + os.environ['CPU_NUM'] = str(2) + places = static.cpu_places() + + data = static.data(name="x", shape=[None, 1], dtype="float32") + hidden = static.nn.fc(input=data, size=10) + loss = paddle.mean(hidden) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + build_strategy = static.BuildStrategy() + build_strategy.enable_inplace = True + build_strategy.memory_optimize = True + build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce + program = static.CompiledProgram(static.default_main_program()) + program = program.with_data_parallel(loss_name=loss.name, + build_strategy=build_strategy, + places=places) +)DOC"); + + py::enum_(build_strategy, "ReduceStrategy") + .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) + .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce) + .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce); + py::enum_(build_strategy, + "GradientScaleStrategy") + .value("CoeffNumDevice", + BuildStrategy::GradientScaleStrategy::kCoeffNumDevice) + .value("One", BuildStrategy::GradientScaleStrategy::kOne) + .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); + + build_strategy.def(py::init()) + .def("_clear_finalized", &BuildStrategy::ClearFinalized) + .def_property( + "reduce_strategy", + [](const BuildStrategy &self) { return self.reduce_; }, + [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.reduce_ = strategy; + }, + R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce + strategies in ParallelExecutor, AllReduce and Reduce. If you want + that all the parameters' optimization are done on all devices independently, + you should choose AllReduce; otherwise, if you choose Reduce, all the parameters' + optimization will be evenly distributed to different devices, and then + broadcast the optimized parameter to other devices. + Default is 'AllReduce'. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce + )DOC") + .def_property( + "gradient_scale_strategy", + [](const BuildStrategy &self) { return self.gradient_scale_; }, + [](BuildStrategy &self, + BuildStrategy::GradientScaleStrategy strategy) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.gradient_scale_ = strategy; + }, + R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three + ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice, + One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` + according to the number of devices. If you want to customize :math:`loss@grad`, + you can choose Customized. Default is 'CoeffNumDevice'. + + Examples: + .. 
code-block:: python + + import numpy + import os + import paddle + import paddle.static as static + + paddle.enable_static() + + use_cuda = True + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + exe = static.Executor(place) + + # NOTE: If you use CPU to run the program, you need + # to specify the CPU_NUM, otherwise, paddle will use + # all the number of the logic core as the CPU_NUM, + # in that case, the batch size of the input should be + # greater than CPU_NUM, if not, the process will be + # failed by an exception. + if not use_cuda: + os.environ['CPU_NUM'] = str(2) + places = static.cpu_places() + else: + places = static.cuda_places() + + data = static.data(name='X', shape=[None, 1], dtype='float32') + hidden = static.nn.fc(input=data, size=10) + loss = paddle.mean(hidden) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) + + exe.run(static.default_startup_program()) + + build_strategy = static.BuildStrategy() + build_strategy.gradient_scale_strategy = \ + static.BuildStrategy.GradientScaleStrategy.Customized + compiled_prog = static.CompiledProgram( + static.default_main_program()).with_data_parallel( + loss_name=loss.name, build_strategy=build_strategy, + places=places) + + dev_count = len(places) + x = numpy.random.random(size=(10, 1)).astype('float32') + loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01 + loss_grad_name = loss.name+"@GRAD" + loss_data = exe.run(compiled_prog, + feed={"X": x, loss_grad_name : loss_grad}, + fetch_list=[loss.name, loss_grad_name]) + )DOC") + .def_property( + "debug_graphviz_path", + [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, + [](BuildStrategy &self, const std::string &path) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.debug_graphviz_path_ = path; + }, + R"DOC((str, optional): debug_graphviz_path indicates the path that + writing the SSA Graph to file in the form of graphviz. + It is useful for debugging. Default is empty string, that is, "" + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.debug_graphviz_path = "./graph" + )DOC") + .def_property( + "enable_sequential_execution", + [](const BuildStrategy &self) { + return self.enable_sequential_execution_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.enable_sequential_execution_ = b; + }, + R"DOC((bool, optional): If set True, the execution order of ops would + be the same as what is in the program. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.enable_sequential_execution = True + )DOC") + .def_property( + "remove_unnecessary_lock", + [](const BuildStrategy &self) { + return self.remove_unnecessary_lock_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.remove_unnecessary_lock_ = b; + }, + R"DOC((bool, optional): If set True, some locks in GPU ops would be + released and ParallelExecutor would run faster. Default is True. 
+ + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.remove_unnecessary_lock = True + )DOC") + .def_property( + "num_trainers", + [](const BuildStrategy &self) { return self.num_trainers_; }, + [](BuildStrategy &self, int num_trainers) { +#ifdef WIN32 + PADDLE_THROW(platform::errors::Unavailable( + "Distribution mode is not supported on Windows platform.")); +#endif + self.num_trainers_ = num_trainers; + }) + .def_property( + "trainers_endpoints", + [](const BuildStrategy &self) { return self.trainers_endpoints_; }, + [](BuildStrategy &self, + const std::vector &trainers_endpoints) { + self.trainers_endpoints_ = trainers_endpoints; + }) + .def_property( + "trainer_id", + [](const BuildStrategy &self) { return self.trainer_id_; }, + [](BuildStrategy &self, int trainer_id) { + self.trainer_id_ = trainer_id; + }) + .def_property( + "nccl_comm_num", + [](const BuildStrategy &self) { return self.nccl_comm_num_; }, + [](BuildStrategy &self, int nccl_comm_num) { + self.nccl_comm_num_ = nccl_comm_num; + }) + .def_property( + "bkcl_comm_num", + [](const BuildStrategy &self) { return self.bkcl_comm_num_; }, + [](BuildStrategy &self, int bkcl_comm_num) { + self.bkcl_comm_num_ = bkcl_comm_num; + }) + .def_property( + "use_hierarchical_allreduce", + [](const BuildStrategy &self) { + return self.use_hierarchical_allreduce_; + }, + [](BuildStrategy &self, bool use) { + self.use_hierarchical_allreduce_ = use; + }) + .def_property( + "hierarchical_allreduce_inter_nranks", + [](const BuildStrategy &self) { + return self.hierarchical_allreduce_inter_nranks_; + }, + [](BuildStrategy &self, int nranks) { + self.hierarchical_allreduce_inter_nranks_ = nranks; + }) + + .def_property( + "fuse_elewise_add_act_ops", + [](const BuildStrategy &self) { + return self.fuse_elewise_add_act_ops_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_elewise_add_act_ops_ = b; + }, + R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether + to fuse elementwise_add_op and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_elewise_add_act_ops = True + )DOC") + .def_property( + "fuse_gemm_epilogue", + [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_gemm_epilogue_ = b; + }, + R"DOC((bool, optional): fuse_gemm_epilogue indicate whether + to fuse matmul_op, elemenewist_add_op and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_gemm_epilogue = True + )DOC") + .def_property( + "fuse_bn_act_ops", + [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_bn_act_ops_ = b; + }, + R"DOC((bool, optional): fuse_bn_act_ops indicate whether + to fuse batch_norm and activation_op, + it may make the execution faster. Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_bn_act_ops = True + )DOC") + .def_property( + "fuse_bn_add_act_ops", + [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_bn_add_act_ops_ = b; + }, + R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether + to fuse batch_norm, elementwise_add and activation_op, + it may make the execution faster. Default is True + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_bn_add_act_ops = True + )DOC") + .def_property( + "enable_auto_fusion", + [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.enable_auto_fusion_ = b; + }, + R"DOC((bool, optional): Whether to enable fusing subgraph to a + fusion_group. Now we only support fusing subgraph that composed + of elementwise-like operators, such as elementwise_add/mul + without broadcast and activations. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.enable_auto_fusion = True + )DOC") + .def_property( + "fuse_relu_depthwise_conv", + [](const BuildStrategy &self) { + return self.fuse_relu_depthwise_conv_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.fuse_relu_depthwise_conv_ = b; + }, + R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether + to fuse relu and depthwise_conv2d, + it will save GPU memory and may make the execution faster. + This options is only available in GPU devices. + Default is False. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_relu_depthwise_conv = True + )DOC") + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { + return self.fuse_broadcast_ops_ == true || + self.fuse_broadcast_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_broadcast_ops_ = b; + }, + R"DOC((bool, optional): fuse_broadcast_op indicates whether + to fuse the broadcast ops. Note that, in Reduce mode, + fusing broadcast ops may make the program faster. Because + fusing broadcast OP equals delaying the execution of all + broadcast Ops, in this case, all nccl streams are used only + for NCCLReduce operations for a period of time. Default False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.fuse_broadcast_ops = True + )DOC") + .def_property( + "fuse_all_optimizer_ops", + [](const BuildStrategy &self) { + return self.fuse_all_optimizer_ops_ == true || + self.fuse_all_optimizer_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, " + "cannot be configured again.")); + self.fuse_all_optimizer_ops_ = b; + }) + .def_property( + "sync_batch_norm", + [](const BuildStrategy &self) { return self.sync_batch_norm_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_NE(self.IsFinalized(), + true, + platform::errors::PreconditionNotMet( + "BuildStrategy has been finlaized, cannot be " + "configured again.")); + self.sync_batch_norm_ = b; + }, + R"DOC((bool, optional): sync_batch_norm indicates whether to use + synchronous batch normalization which synchronizes the mean + and variance through multi-devices in training phase. + Current implementation doesn't support FP16 training and CPU. + And only synchronous on one machine, not all machines. + Default is False. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.sync_batch_norm = True + )DOC") + .def_property( + "memory_optimize", + [](const BuildStrategy &self) -> py::object { + if (self.memory_optimize_) { + return py::cast(self.memory_optimize_.get()); + } else { + return py::cast(nullptr); + } + }, + [](BuildStrategy &self, const py::handle &value) { + auto *py_obj = value.ptr(); + if (py_obj == nullptr || py_obj == Py_None) { + self.memory_optimize_ = paddle::none; + } else if (PyBool_Check(py_obj)) { + self.memory_optimize_ = (py_obj == Py_True); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "BuildStrategy.memory_optimize must be set to None, False " + "or True")); + } + }, + R"DOC((bool, optional): memory opitimize aims to save total memory + consumption, set to True to enable it. + + Default None. None means framework would choose to use or not use + this strategy automatically. Currently, None means that it is + enabled when GC is disabled, and disabled when GC is enabled. + True means enabling and False means disabling. Default is None. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + build_strategy = static.BuildStrategy() + build_strategy.memory_optimize = True + + )DOC") + .def_property( + "is_distribution", + [](const BuildStrategy &self) { return self.is_distribution_; }, + [](BuildStrategy &self, bool b) { +#ifdef WIN32 + if (b) { + PADDLE_THROW(platform::errors::Unavailable( + "Distribution mode is not supported on Windows platform.")); + } +#else + self.is_distribution_ = b; +#endif + }) + .def_property( + "async_mode", + [](const BuildStrategy &self) { return self.async_mode_; }, + [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) + .def_property( + "enable_inplace", + [](const BuildStrategy &self) { return self.enable_inplace_; }, + [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) + .def_property( + "enable_addto", + [](const BuildStrategy &self) { return self.enable_addto_; }, + [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) + .def_property( + "fuse_all_reduce_ops", + [](const BuildStrategy &self) { + return self.fuse_all_reduce_ops_ == true || + self.fuse_all_reduce_ops_ == paddle::none; + }, + [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) + .def_property( + "enable_backward_optimizer_op_deps", + [](const BuildStrategy &self) { + return self.enable_backward_optimizer_op_deps_; + }, + [](BuildStrategy &self, bool b) { + self.enable_backward_optimizer_op_deps_ = b; + }) + .def_property( + "cache_runtime_context", + [](const BuildStrategy &self) { return self.cache_runtime_context_; }, + [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) + .def_property( + "mkldnn_enabled_op_types", + [](const BuildStrategy &self) { + return self.mkldnn_enabled_op_types_; + }, + [](BuildStrategy &self, + const std::unordered_set &mkldnn_enabled_op_types) { + self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; + }) + .def_property( + "fix_op_run_order", + [](const BuildStrategy &self) { return self.fix_op_run_order_; }, + [](BuildStrategy &self, bool fix_op_run_order) { + self.fix_op_run_order_ = fix_op_run_order; + }) + .def_property( + "allow_cuda_graph_capture", + [](const BuildStrategy &self) { + return self.allow_cuda_graph_capture_; + }, + [](BuildStrategy &self, bool allow_cuda_graph_capture) { + self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; + }) + .def("_copy", + [](const BuildStrategy &self) { + auto new_bs = self; + new_bs.ClearFinalized(); + return new_bs; + }) + .def( + "_finalize_strategy_and_create_passes", + [](BuildStrategy &self) -> std::shared_ptr { + return self.CreatePassesFromStrategy(true); + }, + R"DOC(Allow user to customized passes. Normally model-specific + optimization passes should be defined in this way. BuildStrategy + cannot be updated after being finalized.)DOC"); + + m.def("_set_cached_executor_build_strategy", + [](int64_t program_id, const BuildStrategy &build_strategy) { + auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); + cached_exe_info.SetBuildStrategy(program_id, build_strategy); + }); + + pe.def(py::init &, + const std::vector &, + const std::string &, + Scope *, + std::vector &, + const ExecutionStrategy &, + const BuildStrategy &, + ir::Graph *>()) + // NOTE: even we return a vec* to Python use reference policy. + // We still cannot get local_scope from this vector, since the element + // of vec will be freed by Python GC. We can only return Scope* + // one by one and mark them as reference. 
+ .def( + "local_scopes", + [](ParallelExecutor &self) -> std::vector * { + return &self.GetLocalScopes(); + }, + py::return_value_policy::reference) + .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) + .def("_need_create_local_exe_scopes", + &ParallelExecutor::NeedCreateLocalExeScope) + .def("feed_tensors_into_local_scopes", + &ParallelExecutor::FeedTensorsIntoLocalScopes) + .def("feed_and_split_tensor_into_local_scopes", + &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) + .def("run", + [](ParallelExecutor &self, + const std::vector &fetch_tensors, + bool return_merged) -> py::object { + if (return_merged) { + paddle::framework::FetchList ret; + /*gil_scoped_release*/ { + pybind11::gil_scoped_release release; + ret = self.RunAndMerge(fetch_tensors); + } + return py::cast(std::move(ret)); + } else { + paddle::framework::FetchUnmergedList ret; + /*gil_scoped_release*/ { + pybind11::gil_scoped_release release; + ret = self.Run(fetch_tensors); + } + return py::cast(std::move(ret)); + } + }) + .def("device_count", &ParallelExecutor::DeviceCount); + using VarQuantScale = + std::unordered_map>; + py::class_> pass(m, "Pass"); + pass.def(py::init()) + .def("has", &ir::Pass::Has) + .def("set_not_owned", + [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) { + self.SetNotOwned(attr_name, &attr); + }) + .def( + "set", + [](ir::Pass &self, const std::string &name, const std::string &attr) { + self.Set(name, new std::string(attr)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, bool val) { + self.Set(name, new bool(val)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, int val) { + self.Set(name, new int(val)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, + std::vector set) { + self.Set(name, new std::vector(set)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, + std::unordered_set set) { + self.Set(name, new std::unordered_set(set)); + }) + .def("set", + [](ir::Pass &self, + const std::string &name, + std::unordered_set set) { + self.Set(name, new std::unordered_set(set)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, VarQuantScale scales) { + self.Set(name, new VarQuantScale(scales)); + }) + .def("type", &ir::Pass::Type) + .def("apply", [](ir::Pass &self, std::shared_ptr graph) { + self.Apply(graph.get()); + }); + + py::class_> pb( + m, "PassBuilder"); + pb.def(py::init()) + .def("append_pass", + [](ir::PassBuilder &self, + const std::string &pass_type) -> std::shared_ptr { + return self.AppendPass(pass_type); + }) + .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) + .def("insert_pass", + [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { + return self.InsertPass(idx, pass_type); + }) + .def("remove_pass", + [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/parallel_executor.h b/paddle/fluid/pybind/parallel_executor.h new file mode 100644 index 0000000000000..3c3acace033a7 --- /dev/null +++ b/paddle/fluid/pybind/parallel_executor.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
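The Pass and PassBuilder bindings above expose the IR pass pipeline to Python but, unlike the strategy properties, ship without a usage example. A minimal sketch of the intended flow (graph_viz_pass and its graph_viz_path attribute serve only as an illustration; any registered pass and string attribute would do):

    import paddle
    import paddle.static as static

    paddle.enable_static()

    build_strategy = static.BuildStrategy()
    # Finalizing the strategy yields the PassBuilder bound above.
    pass_builder = build_strategy._finalize_strategy_and_create_passes()
    # Append an extra IR pass and set one of its string attributes.
    viz_pass = pass_builder.append_pass("graph_viz_pass")
    viz_pass.set("graph_viz_path", "./viz_graph")
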
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindParallelExecutor(pybind11::module& m); // NOLINT + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc new file mode 100644 index 0000000000000..84dca60c210f2 --- /dev/null +++ b/paddle/fluid/pybind/place.cc @@ -0,0 +1,816 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/save_load_util.h" +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef 
PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" +#include "paddle/utils/none.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif +#include "paddle/fluid/pybind/bind_cost_model.h" +#include "paddle/fluid/pybind/bind_fleet_executor.h" +#include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" +#include "paddle/fluid/pybind/compatible.h" +#include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" +#include "paddle/fluid/pybind/generator_py.h" +#include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" +#include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" +#include "paddle/fluid/pybind/inference_api.h" +#include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" +#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/fluid/string/to_string.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#ifndef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" 
+ +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + +#ifdef PADDLE_WITH_CRYPTO +#include "paddle/fluid/pybind/crypto.h" +#endif + +#if defined PADDLE_WITH_PSCORE +#include "paddle/fluid/pybind/fleet_py.h" +#endif + +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/place.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "pybind11/stl.h" + +DECLARE_bool(use_mkldnn); + +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); + +namespace paddle { +namespace pybind { +PyTypeObject *g_place_pytype = nullptr; +PyTypeObject *g_customplace_pytype = nullptr; +PyTypeObject *g_cudaplace_pytype = nullptr; +PyTypeObject *g_cpuplace_pytype = nullptr; +PyTypeObject *g_xpuplace_pytype = nullptr; +PyTypeObject *g_npuplace_pytype = nullptr; +PyTypeObject *g_cudapinnedplace_pytype = nullptr; +PyTypeObject *g_mluplace_pytype = nullptr; + +template +static inline int PlaceIndex(const PlaceType &p) { // NOLINT + return static_cast(paddle::platform::Place(p).GetType()); +} + +template +static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { + return paddle::platform::Place(p1) == paddle::platform::Place(p2); +} + +void BindPlace(pybind11::module &m) { // NOLINT + using namespace paddle::framework; // NOLINT + py::class_ customplace(m, + "CustomPlace", + R"DOC( + CustomPlace is a descriptor of a device. + It represents a custom device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + + import paddle + fake_cpu_place = paddle.CustomPlace("FakeCPU", 0) + )DOC"); + g_customplace_pytype = reinterpret_cast(customplace.ptr()); + customplace + .def("__init__", + [](platform::CustomPlace &self, + const std::string &device_type, + int dev_id) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), device id must be 0 " + "or " + "positive integer", + device_type, + dev_id); + std::exit(-1); + } + + if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) && + phi::DeviceManager::IsCustom(device_type))) { + int dev_count = static_cast( + phi::DeviceManager::GetDeviceCount(device_type)); + if (UNLIKELY(dev_id >= dev_count)) { + if (dev_count == 0) { + LOG(ERROR) << "Cannot use " << device_type + << " because there is no " << device_type + << " detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), dev_id must " + "inside " + "[0, %d), because %s " + "number on your machine is %d", + device_type, + dev_id, + dev_count, + device_type, + dev_count); + std::exit(-1); + } + } + new (&self) platform::CustomPlace(device_type, dev_id); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CustomPlace(%s, %d), the device type is " + "not registered " + "as a custom device.", + device_type, + dev_id); + std::exit(-1); + } +#else + LOG(ERROR) << string::Sprintf( + "Cannot use CustomDevice because you have installed CPU/GPU" + "version PaddlePaddle.\n" + "If you want to use CustomDevice, please try to install" + "CustomDevice version " + "PaddlePaddle by: pip install paddlepaddle\n" + "If you only have CPU, please change " + "CustomPlace(%s, %d) to be CPUPlace().\n", + device_type, dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("get_device_id", + [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) + .def("get_device_type", + [](const platform::CustomPlace &self) { + return self.GetDeviceType(); + }) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + py::class_ cudaplace(m, "CUDAPlace", R"DOC( + + CUDAPlace is a descriptor of a device. + It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. + Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace, + staring from 0. + The memory of CUDAPlace with different dev_id is not accessible. + Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card. + You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable. + When the program starts, visible GPU devices will be numbered from 0. + If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default, + and the logical ID is the same as the actual ID. + + Parameters: + id (int): GPU device ID. + + Examples: + .. 
code-block:: python + + import paddle + + place = paddle.CUDAPlace(0) + + )DOC"); + g_cudaplace_pytype = reinterpret_cast(cudaplace.ptr()); + cudaplace + .def("__init__", + [](platform::CUDAPlace &self, int dev_id) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid CUDAPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + + if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) { + if (platform::GetGPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use GPU because there is no GPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " + "number on your machine is %d", + dev_id, + platform::GetGPUDeviceCount(), + platform::GetGPUDeviceCount()); + std::exit(-1); + } + } + + new (&self) platform::CUDAPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use GPU because you have installed CPU version " + "PaddlePaddle.\n" + "If you want to use GPU, please try to install GPU version " + "PaddlePaddle by: pip install paddlepaddle-gpu\n" + "If you only have CPU, please change CUDAPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("get_device_id", + [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_get_device_id", + [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); }) +#endif + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + + py::class_ xpuplace(m, "XPUPlace", R"DOC( + **Note**: + Examples: + .. 
code-block:: python + import paddle.fluid as fluid + xpu_place = fluid.XPUPlace(0) + )DOC"); + g_xpuplace_pytype = reinterpret_cast(xpuplace.ptr()); + xpuplace + .def("__init__", + [](platform::XPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_XPU + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) { + if (platform::GetXPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use XPU because there is no XPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid XPUPlace(%d), must inside [0, %d), because XPU " + "number on your machine is %d", + dev_id, + platform::GetXPUDeviceCount(), + platform::GetXPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::XPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use XPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use XPU, please try to install XPU version " + "PaddlePaddle by: pip install paddlepaddle-xpu\n" + "If you only have CPU, please change XPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) +#ifdef PADDLE_WITH_XPU + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::XPUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__repr__", string::to_string) + .def("__str__", string::to_string); +#ifdef PADDLE_WITH_XPU + py::enum_(m, "XPUVersion", py::arithmetic()) + .value("XPU1", phi::backends::xpu::XPUVersion::XPU1) + .value("XPU2", phi::backends::xpu::XPUVersion::XPU2) + .export_values(); + m.def("get_xpu_device_count", platform::GetXPUDeviceCount); + m.def("get_xpu_device_version", + [](int device_id) { return platform::get_xpu_version(device_id); }); +#ifdef PADDLE_WITH_XPU_KP + m.def("get_xpu_device_op_support_types", + [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { + return platform::get_xpu_kp_op_support_type(op_name, version); + }); +#else + m.def("get_xpu_device_op_support_types", + [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_support_type(op_name, version); + }); +#endif + m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) { + return platform::get_xpu_op_list(version); + }); + m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { + // XPUs with Compute Capability > xpu2 support float16 and bfloat16 + return platform::get_xpu_version(place.device) > + phi::backends::xpu::XPUVersion::XPU1; + }); + m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { + // XPUs with Compute Capability > xpu2 support float16 and bfloat16 + return platform::get_xpu_version(place.device) > + phi::backends::xpu::XPUVersion::XPU1; + }); +#endif + + py::class_ cpuplace(m, "CPUPlace", R"DOC( + CPUPlace is a descriptor of a device. + It represents a CPU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + + import paddle + cpu_place = paddle.CPUPlace() + + )DOC"); + g_cpuplace_pytype = reinterpret_cast(cpuplace.ptr()); + cpuplace.def(py::init<>()) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + + py::class_ cudapinnedplace( + m, "CUDAPinnedPlace", R"DOC( + CUDAPinnedPlace is a descriptor of a device. + It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory. + The host operating system will not paging and exchanging the memory. + It can be accessed through direct memory access technology to speed up the copy of data between the host and GPU. + For more information on CUDA data transfer and `pinned memory`, + please refer to `official document `_ . + + Examples: + .. code-block:: python + + import paddle + place = paddle.CUDAPinnedPlace() + + )DOC"); + g_cudapinnedplace_pytype = + reinterpret_cast(cudapinnedplace.ptr()); + cudapinnedplace + .def("__init__", + [](platform::CUDAPinnedPlace &self) { +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use CUDAPinnedPlace in CPU only version, " + "Please recompile or reinstall Paddle with CUDA support.")); +#endif + new (&self) platform::CUDAPinnedPlace(); + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); + + // NPUPlace + py::class_ npuplace(m, "NPUPlace", R"DOC( + NPUPlace is a descriptor of a device. + It represents a NPU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + import paddle + npu_place = paddle.NPUPlace(0) + + )DOC"); + g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); + npuplace + .def("__init__", + [](platform::NPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_ASCEND_CL + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) { + if (platform::GetNPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use NPU because there is no NPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), must inside [0, %d), because NPU " + "number on your machine is %d", + dev_id, + platform::GetNPUDeviceCount(), + platform::GetNPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::NPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use NPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use NPU, please try to install NPU version " + "PaddlePaddle by: pip install paddlepaddle-npu\n" + "If you only have CPU, please change NPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) + .def("__str__", string::to_string); + + // IPUPlace + py::class_(m, "IPUPlace", R"DOC( + IPUPlace is a descriptor of a device. + It represents a IPU device on which a tensor will be allocated and a model will run. + + Examples: + .. code-block:: python + import paddle + + # required: ipu + + ipu_place = paddle.IPUPlace() + + )DOC") + .def("__init__", + [](platform::IPUPlace &self) { +#ifdef PADDLE_WITH_IPU + if (platform::GetIPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use IPU because there is no IPU " + "detected on your " + "machine."; + std::exit(-1); + } + // use ipu(0) to comile, while run with the number user configure + // in sharding and pipline. + new (&self) platform::IPUPlace(0); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use IPU because you didn't install IPU version " + "PaddlePaddle.\n" + "If you want to use IPU, please try to install IPU version " + "PaddlePaddle by: pip install paddlepaddle*\n" + "If you only have CPU, please change IPUPlace to be " + "CPUPlace().\n"); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) +#ifdef PADDLE_WITH_IPU + .def("get_device_id", + [](const platform::IPUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__str__", string::to_string); + + // MLUPlace + py::class_ mluplace(m, "MLUPlace", R"DOC( + MLUPlace is a descriptor of a device. + It represents a MLU device on which a tensor will be allocated and a model will run. + + Examples: + .. 
code-block:: python + import paddle + # required: mlu + mlu_place = paddle.MLUPlace(0) + + )DOC"); + g_mluplace_pytype = reinterpret_cast(mluplace.ptr()); + mluplace + .def("__init__", + [](platform::MLUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_MLU + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid MLUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) { + if (platform::GetMLUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use MLU because there is no MLU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid MLUPlace(%d), must inside [0, %d), because MLU " + "number on your machine is %d", + dev_id, + platform::GetMLUDeviceCount(), + platform::GetMLUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::MLUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use MLU because you have installed CPU/GPU/... " + "version " + "PaddlePaddle.\n" + "If you want to use MLU, please try to install MLU version " + "PaddlePaddle by: pip install paddlepaddle-mlu\n" + "If you only have CPU, please change MLUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) +#ifdef PADDLE_WITH_MLU + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("get_device_id", + [](const platform::MLUPlace &self) { return self.GetDeviceId(); }) +#endif + .def("__str__", string::to_string); + + py::class_ platformplace(m, "Place"); + g_place_pytype = reinterpret_cast(platformplace.ptr()); + platformplace.def(py::init<>()) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("is_gpu_place", + [](platform::Place &self) { return platform::is_gpu_place(self); }) + .def("is_cpu_place", + [](platform::Place &self) { return platform::is_cpu_place(self); }) + .def("is_xpu_place", + [](platform::Place &self) { return platform::is_xpu_place(self); }) + .def("is_npu_place", + [](platform::Place &self) { return platform::is_npu_place(self); }) + .def("is_ipu_place", + [](platform::Place &self) { return platform::is_ipu_place(self); }) + .def("is_cuda_pinned_place", + [](platform::Place &self) { + return platform::is_cuda_pinned_place(self); + }) + .def("is_mlu_place", + [](platform::Place &self) { return platform::is_mlu_place(self); }) + .def( + "is_custom_place", + [](platform::Place &self) { return platform::is_custom_place(self); }) + .def("gpu_device_id", [](platform::Place &self) { return self.device; }) + .def("xpu_device_id", [](platform::Place &self) { return self.device; }) + .def("npu_device_id", [](platform::Place &self) { return self.device; }) + .def("ipu_device_id", [](platform::Place &self) { return self.device; }) + .def("mlu_device_id", [](platform::Place &self) { return self.device; }) + .def("custom_device_id", + [](platform::Place &self) { return self.device; }) + .def("set_place", + [](platform::Place &self, const platform::Place &other) { + self = other; + }) + .def("set_place", + [](platform::Place 
&self, const platform::CPUPlace &cpu_place) { + self = cpu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::XPUPlace &xpu_place) { + self = xpu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::CUDAPlace &gpu_place) { + self = gpu_place; + }) + .def("set_place", + [](platform::Place &self, + const platform::CUDAPinnedPlace &cuda_pinned_place) { + self = cuda_pinned_place; + }) + .def("set_place", + [](platform::Place &self, const platform::NPUPlace &npu_place) { + self = npu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::IPUPlace &ipu_place) { + self = ipu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::MLUPlace &mlu_place) { + self = mlu_place; + }) + .def("set_place", + [](platform::Place &self, const platform::CustomPlace &plug_place) { + self = plug_place; + }) + .def("__repr__", string::to_string) + .def("__str__", string::to_string); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/place.h b/paddle/fluid/pybind/place.h new file mode 100644 index 0000000000000..40fb8d4c7f472 --- /dev/null +++ b/paddle/fluid/pybind/place.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindPlace(pybind11::module& m); // NOLINT + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 62f0402bedc7a..40a03248cd22d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -122,9 +122,12 @@ limitations under the License. 
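Note: the `_type`, `_equals`, and `_copy_from` entries registered in the place bindings above rely on small helper templates whose `template <...>` headers appear in this patch text without their parameter lists. Reconstructed from the surviving bodies and parameter names, they presumably read roughly as follows; the typename spellings are inferred rather than quoted from the patch:

    template <typename PlaceType1, typename PlaceType2>
    static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
      // Two typed places compare equal iff they map to the same generic Place.
      return paddle::platform::Place(p1) == paddle::platform::Place(p2);
    }

    template <typename PlaceType>
    static inline int PlaceIndex(const PlaceType &p) {
      // Exposes the enum value of the place type to Python as an int.
      return static_cast<int>(paddle::platform::Place(p).GetType());
    }

    // Helper behind the Tensor "_copy_from" bindings removed further down;
    // a negative batch_size copies the whole tensor, otherwise only a slice.
    template <typename PlaceType>
    static void TensorCopyFrom(framework::Tensor *dst,
                               const framework::Tensor &src,
                               const PlaceType &place,
                               int64_t batch_size) {
      if (batch_size < 0) {
        framework::TensorCopy(src, place, dst);
      } else {
        auto sliced = src.Slice(0, batch_size);
        framework::TensorCopy(sliced, place, dst);
      }
    }

The pybind.cc hunk below deletes these definitions, so they are presumably re-homed in the new place/tensor translation units introduced by this patch.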
*/ #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/parallel_executor.h" +#include "paddle/fluid/pybind/place.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -194,16 +197,7 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); namespace paddle { namespace pybind { -PyTypeObject *g_place_pytype = nullptr; PyTypeObject *g_framework_scope_pytype = nullptr; -PyTypeObject *g_cudaplace_pytype = nullptr; -PyTypeObject *g_cpuplace_pytype = nullptr; -PyTypeObject *g_xpuplace_pytype = nullptr; -PyTypeObject *g_npuplace_pytype = nullptr; -PyTypeObject *g_cudapinnedplace_pytype = nullptr; -PyTypeObject *g_mluplace_pytype = nullptr; -PyTypeObject *g_customplace_pytype = nullptr; -PyTypeObject *g_framework_tensor_pytype = nullptr; PyTypeObject *g_framework_lodtensorarray_pytype = nullptr; PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr; @@ -349,16 +343,6 @@ bool IsCompiledWithDIST() { #endif } -template -static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) { - return paddle::platform::Place(p1) == paddle::platform::Place(p2); -} - -template -static inline int PlaceIndex(const PlaceType &p) { - return static_cast(paddle::platform::Place(p).GetType()); -} - static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) { // NOTE(zjl): PyObject_GetAttrString would return nullptr when attr_name // is not inside obj, but it would also set the error flag of Python. @@ -541,19 +525,6 @@ static int GetNCCLVersion() { } #endif -template -static void TensorCopyFrom(framework::Tensor *dst, - const framework::Tensor &src, - const PlaceType &place, - int64_t batch_size) { - if (batch_size < 0) { - framework::TensorCopy(src, place, dst); - } else { - auto sliced = src.Slice(0, batch_size); - framework::TensorCopy(sliced, place, dst); - } -} - #ifdef PADDLE_WITH_AVX PYBIND11_MODULE(core_avx, m) { #else @@ -854,897 +825,6 @@ PYBIND11_MODULE(core_noavx, m) { self.EmplaceBackAttr(attr); }); - py::class_ framework_tensor( - m, "Tensor", py::buffer_protocol()); - g_framework_tensor_pytype = - reinterpret_cast(framework_tensor.ptr()); - framework_tensor - .def("__array__", - [](framework::Tensor &self) { return TensorToPyArray(self); }) - .def("_ptr", - [](const framework::Tensor &self) { - return reinterpret_cast(self.data()); - }) - .def("_slice", &framework::Tensor::Slice) - .def("_numel", &framework::Tensor::numel) - .def("_is_initialized", - [](const framework::Tensor &self) { return self.IsInitialized(); }) - .def("_get_dims", - [](const framework::Tensor &self) { return vectorize(self.dims()); }) - .def("_set_dims", - [](framework::Tensor &self, const std::vector &dim) { - self.Resize(phi::make_ddim(dim)); - }) - .def("_set_layout", - [](framework::Tensor &self, const std::string &layout) { - self.set_layout(StringToDataLayout(layout)); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CustomPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::XPUPlace &place) { - self.mutable_data(place); - }) - 
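The includes added in this hunk pull in the new split-out headers (place.h, tensor.h, parallel_executor.h), but the hunk itself only shows declarations and deletions, not where the new entry points are invoked. A minimal sketch, under the assumption that `BindPlace` is simply called from the module body that remains in pybind.cc (the call site itself is not part of this excerpt, and the non-AVX module name is taken from this file):

    #include <pybind11/pybind11.h>

    #include "paddle/fluid/pybind/place.h"

    // Sketch only: the place classes are now registered by the split-out
    // translation unit instead of being defined inline in pybind.cc.
    PYBIND11_MODULE(core_noavx, m) {
      // ... bindings that stay in pybind.cc ...
      paddle::pybind::BindPlace(m);  // Place, CPUPlace, CUDAPlace, XPUPlace, ...
    }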
.def("_alloc_float", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::NPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, paddle::platform::MLUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_double", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CustomPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::XPUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, paddle::platform::MLUPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_int", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) - .def("_alloc_float", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place) { - self.mutable_data(place); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CPUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CustomPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::XPUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CUDAPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::CUDAPinnedPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::MLUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_clear", &framework::Tensor::clear) - .def("_mutable_data", - [](framework::Tensor &self, - paddle::platform::NPUPlace &place, - paddle::framework::proto::VarType::Type type) { - return reinterpret_cast( - self.mutable_data(place, framework::TransToPhiDataType(type))); - }) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", 
- &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("_copy_from", - &TensorCopyFrom, - py::arg("tensor"), - py::arg("place"), - py::arg("batch_size") = -1) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false) - .def("set", - SetTensorFromPyArray, - py::arg("array"), - py::arg("place"), - py::arg("zero_copy") = false, - R"DOC( - Set the data of Tensor on place with given numpy array. - - Args: - lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the - Tensor is to be set. - zero_copy (bool, optional): Whether to share memory with the input numpy array. - This parameter only works with CPUPlace. Default: False. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - )DOC") - - .def( - "shape", - [](framework::Tensor &self) { return vectorize(self.dims()); }, - R"DOC( - Return the shape of Tensor. - - Returns: - list[int]: The shape of Tensor. - - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - print(t.shape()) # [5, 30] - )DOC") - .def("_to_dlpack", - [](framework::Tensor &self) { - DLPackTensor dlpack_tensor(self, 1); - DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor(); - auto capsule = py::capsule( - static_cast(dmt), "dltensor", [](PyObject *ptr) { - if (ptr) { - auto dltensor = new DLManagedTensor; - try { - dltensor = reinterpret_cast( - PyCapsule_GetPointer(ptr, "used_dltensor")); - return; - } catch (...) 
{ - dltensor = reinterpret_cast( - PyCapsule_GetPointer(ptr, "dltensor")); - } - dltensor->deleter(dltensor); - } - }); - return capsule; - }) - .def("_set_float_element", TensorSetElement) - .def("_get_float_element", TensorGetElement) - .def("_set_double_element", TensorSetElement) - .def("_get_double_element", TensorGetElement) - .def("_place", [](framework::Tensor &self) { return self.place(); }) - .def("_dtype", - [](framework::Tensor &self) { - return framework::TransToProtoVarType(self.type()); - }) - .def("_layout", - [](framework::Tensor &self) { - return DataLayoutToString(self.layout()); - }) - .def("_share_data_with", &framework::Tensor::ShareDataWith) - .def("__getitem__", PySliceTensor, py::return_value_policy::reference) - .def("__str__", - [](const framework::Tensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }) /* ------ End of original Tensor ------ */ - .def("__init__", - [](framework::Tensor &instance, - const std::vector> - &recursive_sequence_lengths) { - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, -1), - true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is %s", - new_lod)); - new (&instance) framework::Tensor(new_offset_lod); - }) - .def("__init__", - [](framework::Tensor &instance) { - new (&instance) framework::Tensor(); - }) - // We implement offset based LOD in C++ while we use length based with - // Python API. So we changed set_lod to set_recursive_sequence_lengths - // to - // avoid misuse. - // The discussion is here: - // https://github.com/PaddlePaddle/Paddle/issues/10855 - .def( - "set_lod", - [](framework::Tensor &self, - const std::vector> &lod) { - // the input lod is offset-based level-of-detail info - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - PADDLE_ENFORCE_EQ( - CheckLoD(new_lod, vectorize(self.dims()).front()), - true, - platform::errors::InvalidArgument( - "The provided LoD is invalid, the LoD is %s", new_lod)); - self.set_lod(new_lod); - }, - py::arg("lod"), - R"DOC( - Set LoD of the Tensor. - - Args: - lod (list[list[int]]): The lod to set. - - Returns: - None. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_lod([[0, 2, 5]]) - print(t.lod()) # [[0, 2, 5]] - )DOC") - .def( - "set_recursive_sequence_lengths", - [](framework::Tensor &self, - const std::vector> - &recursive_sequence_lengths) { - // the input recursive_sequence_lengths is length-based - // level-of-detail info - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), - true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is " - "invalid, " - "the LoD converted by recursive_sequence_lengths is " - "%s", - new_lod)); - self.set_lod(new_offset_lod); - }, - py::arg("recursive_sequence_lengths"), - R"DOC( - Set LoD of the Tensor according to recursive sequence lengths. - - For example, if recursive_sequence_lengths=[[2, 3]], which means - there are two sequences with length 2 and 3 respectively, the - corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]]. - - Args: - recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths. - - Returns: - None. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_lengths()) # [[2, 3]] - print(t.lod()) # [[0, 2, 5]] - )DOC") - .def( - "lod", - [](framework::Tensor &self) -> std::vector> { - // output the offset-based lod info - LoD lod = self.lod(); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( - Return the LoD of the Tensor. - - Returns: - list[list[int]]: The lod of the Tensor. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_lod([[0, 2, 5]]) - print(t.lod()) # [[0, 2, 5]] - )DOC") - // Set above comments of set_lod. - .def( - "recursive_sequence_lengths", - [](framework::Tensor &self) -> std::vector> { - // output the length-based lod info - LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); - std::vector> new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return new_lod; - }, - R"DOC( - Return the recursive sequence lengths corresponding to of the LodD - of the Tensor. - - Returns: - list[list[int]]: The recursive sequence lengths. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_lengths()) # [[2, 3]] - )DOC") - .def( - "has_valid_recursive_sequence_lengths", - [](framework::Tensor &self) -> bool { - // Check that the lod info is valid and match the outermost - // dimension of the Tensor data - return CheckLoD(self.lod(), vectorize(self.dims()).front()); - }, - R"DOC( - Check whether the LoD of the Tensor is valid. - - Returns: - bool: Whether the LoD is valid. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - import numpy as np - - t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.has_valid_recursive_sequence_lengths()) # True - )DOC") - .def("_as_type", - [](const framework::Tensor &self, - paddle::framework::proto::VarType::Type type) { - framework::Tensor dst; - if (self.IsInitialized() && self.numel() > 0) { - TransDataType(self, type, &dst); - } - return dst; - }) - .def("_copy", - [](const framework::Tensor &self, const platform::Place &place) { - // follow fetch_op's inplementation - framework::Tensor dst; - if (self.IsInitialized() && self.numel() > 0) { - TensorCopySync(self, place, &dst); - } else { - // Not copy, if the src tensor is empty. - dst.clear(); - dst.Resize({0}); - } - dst.set_lod(self.lod()); - return dst; -#ifdef _WIN32 - }); -#else - }) -#ifdef PADDLE_WITH_CUDA - .def("_share_buffer_with", - [](framework::Tensor &self, const framework::Tensor src, - py::tuple t) { - auto *cuda_ipc_allocation = - dynamic_cast( - src.Holder().get()); - - PADDLE_ENFORCE_NOT_NULL( - cuda_ipc_allocation, - platform::errors::PreconditionNotMet( - "Tensor is not Cuda IPC shared tensor. " - "Now only Tensor shared by cuda ipc could use this " - "api.")); - - size_t size = t[0].cast(); - auto dtype = - static_cast(t[1].cast()); - auto dims = phi::make_ddim(t[2].cast>()); - auto lod_info = t[3].cast(); - auto device_id = t[4].cast(); - - auto shared_reader_holder = - std::make_shared( - cuda_ipc_allocation->ptr(), - cuda_ipc_allocation->base_ptr(), size, - platform::CUDAPlace(device_id)); - - self.ResetHolderWithType(shared_reader_holder, dtype); - self.Resize(dims); - self.set_lod(lod_info); - - VLOG(6) << "Reconstructed tensor with buffer shared!"; - }, - R"DOC( - Deserialize GPU Tensor for existed shared Cuda IPC tensor. - - Params: - tensor: Shared Cuda IPC tensor. - tuple: contrains data size, data type, - tensor dims, lod information, device index. - - )DOC") - .def("_share_cuda", - [](framework::Tensor self) { - if (!self.IsInitialized() || self.numel() == 0) - throw std::runtime_error( - "Tensor not initialized or numel is 0. could not pass " - "to shared memory. "); - - auto *holder = dynamic_cast( - self.Holder().get()); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(holder->place()), true, - platform::errors::InvalidArgument( - "Tensor is not on GPU. share_cuda only support GPU " - "Tensor, share_filename is for CPU tensor.")); - - void *base_ptr = holder->base_ptr(); - ptrdiff_t offset_bytes = reinterpret_cast(holder->ptr()) - - reinterpret_cast(base_ptr); - - cudaIpcMemHandle_t handle; - PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr)); - - auto _handle = py::bytes(reinterpret_cast(&handle), - (py::ssize_t)CUDA_IPC_HANDLE_SIZE); - - // TODO(ZHUI): use cuda event, to avoid sync. - const auto &device_id = paddle::platform::GetCurrentDeviceId(); - auto stream = - paddle::platform::stream::get_current_stream(device_id); - stream->Synchronize(); - - int type_idx = static_cast(self.type()); - size_t data_size = - self.numel() * - framework::SizeOfType( - framework::TransToProtoVarType(self.type())); - - return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, - type_idx, vectorize(self.dims()), self.lod(), - device_id); - }, - R"DOC( - Serialize GPU Tensor by cudaIpcMemHandle. - - Returns: - tuple: contrains handle, data size, data type, - tensor dims, lod information, device index. - - Examples: - .. 
code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_cuda() - - )DOC") - .def("_new_shared_cuda", - [](py::tuple t) { - if (t.size() != 7) - throw std::runtime_error( - "Invalid Tensor meta info for shared cuda tensor!"); - - // 1. Create a new C++ instance - framework::Tensor tensor; - - // 2. Rebuild Allocation from handle - const std::string &handle = t[0].cast(); - ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast(); - auto device_id = t[6].cast(); - auto base_ptr = memory::allocation::GetIpcBasePtr(handle); - size_t size = t[2].cast(); - void *dev = base_ptr.get(); - dev = reinterpret_cast(dev) + offset_bytes; - - auto shared_reader_holder = - std::make_shared( - dev, size, device_id, std::move(base_ptr)); - - // 3. Rebuild Tensor - tensor.ResetHolderWithType( - shared_reader_holder, - static_cast(t[3].cast())); - tensor.Resize(phi::make_ddim(t[4].cast>())); - tensor.set_lod(t[5].cast()); - - return tensor; - }, - R"DOC( - Deserialize GPU lod tensor from cudaIpcMemHandle. - - Params: - tuple: contrains handle, data size, data type, - tensor dims, lod information, device index. - - Examples: - .. code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_cuda() - tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo)) - - )DOC") -#endif - .def("_share_filename", - [](framework::Tensor &self) { - if (!self.IsInitialized() || self.numel() == 0) - throw std::runtime_error( - "Tensor not initialized or numel is 0. could not pass to " - "shared memory. "); - - auto holder = self.Holder(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(holder->place()) || - platform::is_cuda_pinned_place(holder->place()), - true, platform::errors::InvalidArgument( - "Tensor is not on CPU. share_filename only " - "support CPU Tensor.")); - - auto *mmap_allocation = dynamic_cast< - memory::allocation::RefcountedMemoryMapAllocation *>( - holder.get()); - // If the tensor is not shared, allocate memory map allocation. - if (mmap_allocation == nullptr) { - void *data_ptr = self.data(); - size_t data_size = - self.numel() * - framework::SizeOfType( - framework::TransToProtoVarType(self.type())); - - int flags = memory::allocation::MAPPED_SHAREDMEM | - memory::allocation::MAPPED_EXCLUSIVE; - std::string handle = memory::allocation::GetIPCName(); - auto shared_holder = - memory::allocation::AllocateRefcountedMemoryMapAllocation( - handle, flags, data_size); - - // copy data & reset holder - if (platform::is_cuda_pinned_place(holder->place())) { -#ifdef PADDLE_WITH_CUDA - memory::Copy(platform::CPUPlace(), shared_holder->ptr(), - platform::CUDAPinnedPlace(), data_ptr, data_size); -#endif - } else { - memory::Copy(platform::CPUPlace(), shared_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); - } - self.ResetHolder(shared_holder); - mmap_allocation = shared_holder.get(); - } - int type_idx = static_cast(self.type()); - - return py::make_tuple(mmap_allocation->ipc_name(), - mmap_allocation->size(), type_idx, - vectorize(self.dims()), self.lod()); - }, - R"DOC( - Serialize CPU lod tensor in shared memory to tuple. - If the tensor is not in shared memory, we will copy it first. - - Returns: - tuple: contrains ipc name, data size, data type, - tensor dims and lod imformation. - - Examples: - .. 
code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_filename() - - )DOC") - .def("_new_shared_filename", - [](py::tuple t) { // __setstate__ - if (t.size() != 5) - throw std::runtime_error("Invalid Tensor meta info state!"); - - framework::Tensor tensor; - - // 2. Rebuild Allocation - const std::string &ipc_name = t[0].cast(); - size_t size = t[1].cast(); - int flags = memory::allocation::MAPPED_SHAREDMEM | - memory::allocation::MAPPED_NOCREATE; - - auto shared_holder = - memory::allocation::AllocateRefcountedMemoryMapAllocation( - ipc_name, flags, size); - - // 3. Rebuild Tensor - tensor.ResetHolderWithType( - shared_holder, - static_cast(t[2].cast())); - tensor.Resize(phi::make_ddim(t[3].cast>())); - tensor.set_lod(t[4].cast()); - - return tensor; - }, - R"DOC( - Deserialize CPU lod tensor from shared memory. - - Params: - tuple: contrains ipc file name, data size, data type, - tensor dims and lod information. - - Examples: - .. code-block:: python - - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_filename() - tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)) - - )DOC") - .def("_shared_incref", - [](framework::Tensor &self) { - auto *mmap_allocation = dynamic_cast< - memory::allocation::RefcountedMemoryMapAllocation *>( - self.Holder().get()); - if (mmap_allocation) { - mmap_allocation->incref(); - } - }, - R"DOC( - Increase reference count of share_filename tensor. - )DOC") - .def("_shared_decref", - [](framework::Tensor &self) { - auto *mmap_allocation = dynamic_cast< - memory::allocation::RefcountedMemoryMapAllocation *>( - self.Holder().get()); - if (mmap_allocation) { - mmap_allocation->decref(); - } - }, - R"DOC( - Decrease reference count of share_filename tensor. - )DOC") - .def(py::pickle( - [](const framework::Tensor &t) { // __getstate__ - auto holder = t.Holder(); - PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true, - platform::errors::PreconditionNotMet( - "Tensor is not on CPU." - "Now only Tensor on CPU can be serialized.")); - auto *mmap_writer_allocation = - dynamic_cast( - holder.get()); - PADDLE_ENFORCE_NOT_NULL( - mmap_writer_allocation, - platform::errors::PreconditionNotMet( - "Tensor is not in shared memory." - "Now only Tensor on shared memory can be serialized.")); - int type_idx = static_cast(t.type()); - - return py::make_tuple(mmap_writer_allocation->ipc_name(), - mmap_writer_allocation->size(), type_idx, - vectorize(t.dims()), t.lod()); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 5) - throw std::runtime_error("Invalid Tensor state!"); - - // 1. Create a new C++ instance - framework::Tensor tensor; - - // 2. Rebuild Allocation - const std::string &ipc_name = t[0].cast(); - size_t size = t[1].cast(); - auto shared_reader_holder = - memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name, - size); - - // 3. Maintain global fd set - VLOG(3) << "Tensor ipc name: " << ipc_name; - memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); - - // 4. 
Rebuild Tensor - tensor.ResetHolderWithType( - shared_reader_holder, - static_cast(t[2].cast())); - tensor.Resize(phi::make_ddim(t[3].cast>())); - tensor.set_lod(t[4].cast()); - - return tensor; - })); -#endif - - py::class_(m, "SelectedRows") - .def("__init__", - [](phi::SelectedRows &instance) { - new (&instance) phi::SelectedRows(); - }) - .def("__init__", - [](phi::SelectedRows &instance, - const std::vector rows, - const int64_t &height) { - new (&instance) phi::SelectedRows(rows, height); - }) - .def( - "get_tensor", - [](phi::SelectedRows &self) { return self.mutable_value(); }, - py::return_value_policy::reference) - .def("numel", - [](phi::SelectedRows &self) -> int64_t { - return self.value().numel(); - }) - .def("set_height", &phi::SelectedRows::set_height) - .def("height", &phi::SelectedRows::height) - .def("set_rows", - [](phi::SelectedRows &self, std::vector rows) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - self.set_rows(rows); -#else - Vector new_rows(rows); - self.set_rows(new_rows); -#endif - }) - .def("sync_index", - [](phi::SelectedRows &instance) { instance.SyncIndex(); }) - .def("rows", [](phi::SelectedRows &self) { - auto rows = self.rows(); - std::vector new_rows; - new_rows.reserve(rows.size()); - std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); - return new_rows; - }); - py::class_(m, "Variable", R"DOC(Variable Class. All parameter, weight, gradient are variables in Paddle. @@ -2272,603 +1352,6 @@ All parameter, weight, gradient are variables in Paddle. #endif return devices; }); - py::class_ customplace(m, - "CustomPlace", - R"DOC( - CustomPlace is a descriptor of a device. - It represents a custom device on which a tensor will be allocated and a model will run. - - Examples: - .. code-block:: python - - import paddle - fake_cpu_place = paddle.CustomPlace("FakeCPU", 0) - )DOC"); - g_customplace_pytype = reinterpret_cast(customplace.ptr()); - customplace - .def("__init__", - [](platform::CustomPlace &self, - const std::string &device_type, - int dev_id) { -#ifdef PADDLE_WITH_CUSTOM_DEVICE - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid CustomPlace(%s, %d), device id must be 0 " - "or " - "positive integer", - device_type, - dev_id); - std::exit(-1); - } - - if (LIKELY(phi::DeviceManager::HasDeviceType(device_type) && - phi::DeviceManager::IsCustom(device_type))) { - int dev_count = static_cast( - phi::DeviceManager::GetDeviceCount(device_type)); - if (UNLIKELY(dev_id >= dev_count)) { - if (dev_count == 0) { - LOG(ERROR) << "Cannot use " << device_type - << " because there is no " << device_type - << " detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CustomPlace(%s, %d), dev_id must " - "inside " - "[0, %d), because %s " - "number on your machine is %d", - device_type, - dev_id, - dev_count, - device_type, - dev_count); - std::exit(-1); - } - } - new (&self) platform::CustomPlace(device_type, dev_id); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CustomPlace(%s, %d), the device type is " - "not registered " - "as a custom device.", - device_type, - dev_id); - std::exit(-1); - } -#else - LOG(ERROR) << string::Sprintf( - "Cannot use CustomDevice because you have installed CPU/GPU" - "version PaddlePaddle.\n" - "If you want to use CustomDevice, please try to install" - "CustomDevice version " - "PaddlePaddle by: pip install paddlepaddle\n" - "If you only have CPU, please change " - "CustomPlace(%s, %d) to be CPUPlace().\n", - device_type, 
dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("get_device_id", - [](const platform::CustomPlace &self) { return self.GetDeviceId(); }) - .def("get_device_type", - [](const platform::CustomPlace &self) { - return self.GetDeviceType(); - }) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - py::class_ cudaplace(m, "CUDAPlace", R"DOC( - - CUDAPlace is a descriptor of a device. - It represents a GPU device allocated or to be allocated with Tensor or LoDTensor. - Each CUDAPlace has a dev_id to indicate the graphics card ID represented by the current CUDAPlace, - staring from 0. - The memory of CUDAPlace with different dev_id is not accessible. - Numbering here refers to the logical ID of the visible graphics card, not the actual ID of the graphics card. - You can set visible GPU devices by setting the `CUDA_VISIBLE_DEVICES` environment variable. - When the program starts, visible GPU devices will be numbered from 0. - If `CUDA_VISIBLE_DEVICES` is not set, all devices are visible by default, - and the logical ID is the same as the actual ID. - - Parameters: - id (int): GPU device ID. - - Examples: - .. code-block:: python - - import paddle - - place = paddle.CUDAPlace(0) - - )DOC"); - g_cudaplace_pytype = reinterpret_cast(cudaplace.ptr()); - cudaplace - .def("__init__", - [](platform::CUDAPlace &self, int dev_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid CUDAPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - - if (UNLIKELY(dev_id >= platform::GetGPUDeviceCount())) { - if (platform::GetGPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use GPU because there is no GPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid CUDAPlace(%d), must inside [0, %d), because GPU " - "number on your machine is %d", - dev_id, - platform::GetGPUDeviceCount(), - platform::GetGPUDeviceCount()); - std::exit(-1); - } - } - - new (&self) platform::CUDAPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use GPU because you have installed CPU version " - "PaddlePaddle.\n" - "If you want to use GPU, please try to install GPU version " - "PaddlePaddle by: pip install paddlepaddle-gpu\n" - "If you only have CPU, please change CUDAPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - .def("get_device_id", - [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_get_device_id", - [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); }) -#endif - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - - py::class_ xpuplace(m, "XPUPlace", R"DOC( - **Note**: - Examples: - .. 
code-block:: python - import paddle.fluid as fluid - xpu_place = fluid.XPUPlace(0) - )DOC"); - g_xpuplace_pytype = reinterpret_cast(xpuplace.ptr()); - xpuplace - .def("__init__", - [](platform::XPUPlace &self, int dev_id) { -#ifdef PADDLE_WITH_XPU - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid XPUPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - if (UNLIKELY(dev_id >= platform::GetXPUDeviceCount())) { - if (platform::GetXPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use XPU because there is no XPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid XPUPlace(%d), must inside [0, %d), because XPU " - "number on your machine is %d", - dev_id, - platform::GetXPUDeviceCount(), - platform::GetXPUDeviceCount()); - std::exit(-1); - } - } - new (&self) platform::XPUPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use XPU because you have installed CPU/GPU version " - "PaddlePaddle.\n" - "If you want to use XPU, please try to install XPU version " - "PaddlePaddle by: pip install paddlepaddle-xpu\n" - "If you only have CPU, please change XPUPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) -#ifdef PADDLE_WITH_XPU - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("get_device_id", - [](const platform::XPUPlace &self) { return self.GetDeviceId(); }) -#endif - .def("__repr__", string::to_string) - .def("__str__", string::to_string); -#ifdef PADDLE_WITH_XPU - py::enum_(m, "XPUVersion", py::arithmetic()) - .value("XPU1", phi::backends::xpu::XPUVersion::XPU1) - .value("XPU2", phi::backends::xpu::XPUVersion::XPU2) - .export_values(); - m.def("get_xpu_device_count", platform::GetXPUDeviceCount); - m.def("get_xpu_device_version", - [](int device_id) { return platform::get_xpu_version(device_id); }); -#ifdef PADDLE_WITH_XPU_KP - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_kp_op_support_type(op_name, version); - }); -#else - m.def("get_xpu_device_op_support_types", - [](const std::string &op_name, phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_op_support_type(op_name, version); - }); -#endif - m.def("get_xpu_device_op_list", [](phi::backends::xpu::XPUVersion version) { - return platform::get_xpu_op_list(version); - }); - m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { - // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > - phi::backends::xpu::XPUVersion::XPU1; - }); - m.def("is_bfloat16_supported", [](const platform::XPUPlace &place) -> bool { - // XPUs with Compute Capability > xpu2 support float16 and bfloat16 - return platform::get_xpu_version(place.device) > - phi::backends::xpu::XPUVersion::XPU1; - }); -#endif - - py::class_ cpuplace(m, "CPUPlace", R"DOC( - CPUPlace is a descriptor of a device. - It represents a CPU device on which a tensor will be allocated and a model will run. - - Examples: - .. 
code-block:: python - - import paddle - cpu_place = paddle.CPUPlace() - - )DOC"); - g_cpuplace_pytype = reinterpret_cast(cpuplace.ptr()); - cpuplace.def(py::init<>()) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - - py::class_ cudapinnedplace( - m, "CUDAPinnedPlace", R"DOC( - CUDAPinnedPlace is a descriptor of a device. - It refers to the page locked memory allocated by the CUDA function `cudaHostAlloc()` in the host memory. - The host operating system will not paging and exchanging the memory. - It can be accessed through direct memory access technology to speed up the copy of data between the host and GPU. - For more information on CUDA data transfer and `pinned memory`, - please refer to `official document `_ . - - Examples: - .. code-block:: python - - import paddle - place = paddle.CUDAPinnedPlace() - - )DOC"); - g_cudapinnedplace_pytype = - reinterpret_cast(cudapinnedplace.ptr()); - cudapinnedplace - .def("__init__", - [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - PADDLE_THROW(platform::errors::PermissionDenied( - "Cannot use CUDAPinnedPlace in CPU only version, " - "Please recompile or reinstall Paddle with CUDA support.")); -#endif - new (&self) platform::CUDAPinnedPlace(); - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); - - // NPUPlace - py::class_ npuplace(m, "NPUPlace", R"DOC( - NPUPlace is a descriptor of a device. - It represents a NPU device on which a tensor will be allocated and a model will run. - - Examples: - .. 
code-block:: python - import paddle - npu_place = paddle.NPUPlace(0) - - )DOC"); - g_npuplace_pytype = reinterpret_cast(npuplace.ptr()); - npuplace - .def("__init__", - [](platform::NPUPlace &self, int dev_id) { -#ifdef PADDLE_WITH_ASCEND_CL - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid NPUPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) { - if (platform::GetNPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use NPU because there is no NPU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid NPUPlace(%d), must inside [0, %d), because NPU " - "number on your machine is %d", - dev_id, - platform::GetNPUDeviceCount(), - platform::GetNPUDeviceCount()); - std::exit(-1); - } - } - new (&self) platform::NPUPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use NPU because you have installed CPU/GPU version " - "PaddlePaddle.\n" - "If you want to use NPU, please try to install NPU version " - "PaddlePaddle by: pip install paddlepaddle-npu\n" - "If you only have CPU, please change NPUPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("get_device_id", - [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) - .def("__str__", string::to_string); - - // IPUPlace - py::class_(m, "IPUPlace", R"DOC( - IPUPlace is a descriptor of a device. - It represents a IPU device on which a tensor will be allocated and a model will run. - - Examples: - .. code-block:: python - import paddle - - # required: ipu - - ipu_place = paddle.IPUPlace() - - )DOC") - .def("__init__", - [](platform::IPUPlace &self) { -#ifdef PADDLE_WITH_IPU - if (platform::GetIPUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use IPU because there is no IPU " - "detected on your " - "machine."; - std::exit(-1); - } - // use ipu(0) to comile, while run with the number user configure - // in sharding and pipline. - new (&self) platform::IPUPlace(0); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use IPU because you didn't install IPU version " - "PaddlePaddle.\n" - "If you want to use IPU, please try to install IPU version " - "PaddlePaddle by: pip install paddlepaddle*\n" - "If you only have CPU, please change IPUPlace to be " - "CPUPlace().\n"); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) -#ifdef PADDLE_WITH_IPU - .def("get_device_id", - [](const platform::IPUPlace &self) { return self.GetDeviceId(); }) -#endif - .def("__str__", string::to_string); - - // MLUPlace - py::class_ mluplace(m, "MLUPlace", R"DOC( - MLUPlace is a descriptor of a device. - It represents a MLU device on which a tensor will be allocated and a model will run. - - Examples: - .. 
code-block:: python - import paddle - # required: mlu - mlu_place = paddle.MLUPlace(0) - - )DOC"); - g_mluplace_pytype = reinterpret_cast(mluplace.ptr()); - mluplace - .def("__init__", - [](platform::MLUPlace &self, int dev_id) { -#ifdef PADDLE_WITH_MLU - if (UNLIKELY(dev_id < 0)) { - LOG(ERROR) << string::Sprintf( - "Invalid MLUPlace(%d), device id must be 0 or " - "positive integer", - dev_id); - std::exit(-1); - } - if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) { - if (platform::GetMLUDeviceCount() == 0) { - LOG(ERROR) << "Cannot use MLU because there is no MLU " - "detected on your " - "machine."; - std::exit(-1); - } else { - LOG(ERROR) << string::Sprintf( - "Invalid MLUPlace(%d), must inside [0, %d), because MLU " - "number on your machine is %d", - dev_id, - platform::GetMLUDeviceCount(), - platform::GetMLUDeviceCount()); - std::exit(-1); - } - } - new (&self) platform::MLUPlace(dev_id); -#else - LOG(ERROR) << string::Sprintf( - "Cannot use MLU because you have installed CPU/GPU/... " - "version " - "PaddlePaddle.\n" - "If you want to use MLU, please try to install MLU version " - "PaddlePaddle by: pip install paddlepaddle-mlu\n" - "If you only have CPU, please change MLUPlace(%d) to be " - "CPUPlace().\n", - dev_id); - std::exit(-1); -#endif - }) - .def("_type", &PlaceIndex) -#ifdef PADDLE_WITH_MLU - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", - &IsSamePlace) - .def("get_device_id", - [](const platform::MLUPlace &self) { return self.GetDeviceId(); }) -#endif - .def("__str__", string::to_string); - - py::class_ platformplace(m, "Place"); - g_place_pytype = reinterpret_cast(platformplace.ptr()); - platformplace.def(py::init<>()) - .def("_type", &PlaceIndex) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("_equals", &IsSamePlace) - .def("is_gpu_place", - [](platform::Place &self) { return platform::is_gpu_place(self); }) - .def("is_cpu_place", - [](platform::Place &self) { return platform::is_cpu_place(self); }) - .def("is_xpu_place", - [](platform::Place &self) { return platform::is_xpu_place(self); }) - .def("is_npu_place", - [](platform::Place &self) { return platform::is_npu_place(self); }) - .def("is_ipu_place", - [](platform::Place &self) { return platform::is_ipu_place(self); }) - .def("is_cuda_pinned_place", - [](platform::Place &self) { - return platform::is_cuda_pinned_place(self); - }) - .def("is_mlu_place", - [](platform::Place &self) { return platform::is_mlu_place(self); }) - .def( - "is_custom_place", - [](platform::Place &self) { return platform::is_custom_place(self); }) - .def("gpu_device_id", [](platform::Place &self) { return self.device; }) - .def("xpu_device_id", [](platform::Place &self) { return self.device; }) - .def("npu_device_id", [](platform::Place &self) { return self.device; }) - .def("ipu_device_id", [](platform::Place &self) { return self.device; }) - .def("mlu_device_id", [](platform::Place &self) { return self.device; }) - .def("custom_device_id", - [](platform::Place &self) { return self.device; }) - .def("set_place", - [](platform::Place &self, const platform::Place &other) { - self = other; - }) - .def("set_place", - [](platform::Place 
&self, const platform::CPUPlace &cpu_place) { - self = cpu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::XPUPlace &xpu_place) { - self = xpu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::CUDAPlace &gpu_place) { - self = gpu_place; - }) - .def("set_place", - [](platform::Place &self, - const platform::CUDAPinnedPlace &cuda_pinned_place) { - self = cuda_pinned_place; - }) - .def("set_place", - [](platform::Place &self, const platform::NPUPlace &npu_place) { - self = npu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::IPUPlace &ipu_place) { - self = ipu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::MLUPlace &mlu_place) { - self = mlu_place; - }) - .def("set_place", - [](platform::Place &self, const platform::CustomPlace &plug_place) { - self = plug_place; - }) - .def("__repr__", string::to_string) - .def("__str__", string::to_string); py::class_(m, "Operator") .def_static("create", @@ -3661,927 +2144,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("clear_executor_cache", []() { framework::ExecutorInfoCache::Instance().Finalize(); }); - using VarQuantScale = - std::unordered_map>; - - py::class_> pass(m, "Pass"); - pass.def(py::init()) - .def("has", &ir::Pass::Has) - .def("set_not_owned", - [](ir::Pass &self, const std::string &attr_name, ProgramDesc &attr) { - self.SetNotOwned(attr_name, &attr); - }) - .def( - "set", - [](ir::Pass &self, const std::string &name, const std::string &attr) { - self.Set(name, new std::string(attr)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, bool val) { - self.Set(name, new bool(val)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, int val) { - self.Set(name, new int(val)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::vector set) { - self.Set(name, new std::vector(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, - const std::string &name, - std::unordered_set set) { - self.Set(name, new std::unordered_set(set)); - }) - .def("set", - [](ir::Pass &self, const std::string &name, VarQuantScale scales) { - self.Set(name, new VarQuantScale(scales)); - }) - .def("type", &ir::Pass::Type) - .def("apply", [](ir::Pass &self, std::shared_ptr graph) { - self.Apply(graph.get()); - }); - - py::class_> pb( - m, "PassBuilder"); - pb.def(py::init()) - .def("append_pass", - [](ir::PassBuilder &self, - const std::string &pass_type) -> std::shared_ptr { - return self.AppendPass(pass_type); - }) - .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) - .def("insert_pass", - [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { - return self.InsertPass(idx, pass_type); - }) - .def("remove_pass", - [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); - - // -- python binds for parallel executor. - py::class_ pe(m, "ParallelExecutor"); - py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( - ExecutionStrategy allows the user to more preciously control how to run - the program in ParallelExecutor by setting the property. - - Returns: - ExecutionStrategy: An ExecutionStrategy object. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - import paddle.nn.functional as F - - paddle.enable_static() - - x = static.data(name='x', shape=[None, 13], dtype='float32') - y = static.data(name='y', shape=[None, 1], dtype='float32') - y_predict = static.nn.fc(input=x, size=1, act=None) - - cost = F.square_error_cost(input=y_predict, label=y) - avg_loss = paddle.mean(cost) - - sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_loss) - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - - train_exe = static.ParallelExecutor(use_cuda=False, - loss_name=avg_loss.name, - exec_strategy=exec_strategy) - )DOC"); - - py::enum_(m, "DeviceType", py::arithmetic()) - .value("CPU", paddle::platform::DeviceType::CPU) - .value("CUDA", paddle::platform::DeviceType::CUDA) - .value("XPU", paddle::platform::DeviceType::XPU); - - exec_strategy.def(py::init()) - .def_property( - "num_threads", - [](const ExecutionStrategy &self) { return self.num_threads_; }, - [](ExecutionStrategy &self, size_t num_threads) { - self.num_threads_ = num_threads; - }, - R"DOC( - The type is INT, num_threads represents the size of thread pool that - used to run the operators of the current program in ParallelExecutor. - If :math:`num\_threads=1`, all the operators will execute one by one, - but the order maybe difference between iterations. - If it is not set, it will be set in ParallelExecutor according to the - device type and device count, for GPU, :math:`num\_threads=device\_count*4`, for CPU, - :math:`num\_threads=CPU\_NUM*4`, the explanation of:math:`CPU\_NUM` is in ParallelExecutor. - if it is not set, ParallelExecutor will get the cpu count by calling - `multiprocessing.cpu_count()`. Default 0. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_threads = 4 - )DOC") - .def_property( - "_use_device", - [](const ExecutionStrategy &self) { return self.use_device_; }, - [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) { - self.use_device_ = use_device; - }) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because - // use_device isn‘t exposed to users. - .def_property( - "allow_op_delay", - [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, - [](ExecutionStrategy &self, bool allow_op_delay) { - self.allow_op_delay_ = allow_op_delay; - }, - R"DOC(The type is BOOL, allow_op_delay represents whether to delay the - communication operators to run, it may make the execution faster. - Note that this option is invalid now, and it will be removed in - next version. Default False.)DOC") - .def_property( - "num_iteration_per_drop_scope", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_drop_scope_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) { - self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope; - }, - R"DOC(The type is INT, num_iteration_per_drop_scope indicates how - many iterations to clean up the temp variables which - is generated during execution. It may make the execution faster, - because the temp variable's shape maybe the same between two iterations. - Default 100. - - .. note:: - 1. If you fetch data when calling the 'run', the ParallelExecutor - will clean up the temp variables at the end of the current iteration. - 2. 
In some NLP model, it may cause the GPU memory is insufficient, - in this case, you should reduce `num_iteration_per_drop_scope`. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_drop_scope = 10 - )DOC") - .def_property( - "num_iteration_per_run", - [](const ExecutionStrategy &self) { - return self.num_iteration_per_run_; - }, - [](ExecutionStrategy &self, size_t num_iteration_per_run) { - self.num_iteration_per_run_ = num_iteration_per_run; - }, - R"DOC(This config that how many iteration the executor will run when - user call exe.run() in python。Default: 1. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - exec_strategy = static.ExecutionStrategy() - exec_strategy.num_iteration_per_run = 10 - )DOC") - .def_property( - "use_thread_barrier", - [](const ExecutionStrategy &self) { return self.thread_barrier_; }, - [](ExecutionStrategy &self, bool use_thread_barrier) { - self.thread_barrier_ = use_thread_barrier; - }, - R"DOC(This config that the this is distributed training with parameter server - )DOC") - .def_property( - "_dry_run", - [](const ExecutionStrategy &self) { return self.dry_run_; }, - [](ExecutionStrategy &self, bool dry_run) { - self.dry_run_ = dry_run; - }); - - exec_strategy.def_property( - "use_experimental_executor", - [](const ExecutionStrategy &self) { - return self.type_ == ExecutionStrategy::kExperimental; - }, - [](ExecutionStrategy &self, bool experimental) { - self.type_ = experimental ? ExecutionStrategy::kExperimental - : ExecutionStrategy::kDefault; - }); - - py::class_ build_strategy(pe, "BuildStrategy", R"DOC( - BuildStrategy allows the user to more preciously control how to - build the SSA Graph in ParallelExecutor by setting the property. - - Returns: - BuildStrategy: An BuildStrategy object. - - Examples: - .. 
code-block:: python - - import os - import paddle - import paddle.static as static - - paddle.enable_static() - - os.environ['CPU_NUM'] = str(2) - places = static.cpu_places() - - data = static.data(name="x", shape=[None, 1], dtype="float32") - hidden = static.nn.fc(input=data, size=10) - loss = paddle.mean(hidden) - paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - build_strategy = static.BuildStrategy() - build_strategy.enable_inplace = True - build_strategy.memory_optimize = True - build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - program = static.CompiledProgram(static.default_main_program()) - program = program.with_data_parallel(loss_name=loss.name, - build_strategy=build_strategy, - places=places) -)DOC"); - - py::enum_(build_strategy, "ReduceStrategy") - .value("Reduce", BuildStrategy::ReduceStrategy::kReduce) - .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce) - .value("_NoReduce", BuildStrategy::ReduceStrategy::kNoReduce); - py::enum_(build_strategy, - "GradientScaleStrategy") - .value("CoeffNumDevice", - BuildStrategy::GradientScaleStrategy::kCoeffNumDevice) - .value("One", BuildStrategy::GradientScaleStrategy::kOne) - .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized); - - build_strategy.def(py::init()) - .def("_clear_finalized", &BuildStrategy::ClearFinalized) - .def_property( - "reduce_strategy", - [](const BuildStrategy &self) { return self.reduce_; }, - [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.reduce_ = strategy; - }, - R"DOC((fluid.BuildStrategy.ReduceStrategy, optional): there are two reduce - strategies in ParallelExecutor, AllReduce and Reduce. If you want - that all the parameters' optimization are done on all devices independently, - you should choose AllReduce; otherwise, if you choose Reduce, all the parameters' - optimization will be evenly distributed to different devices, and then - broadcast the optimized parameter to other devices. - Default is 'AllReduce'. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce - )DOC") - .def_property( - "gradient_scale_strategy", - [](const BuildStrategy &self) { return self.gradient_scale_; }, - [](BuildStrategy &self, - BuildStrategy::GradientScaleStrategy strategy) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.gradient_scale_ = strategy; - }, - R"DOC((paddle.static.BuildStrategy.GradientScaleStrategy, optional): there are three - ways of defining :math:`loss@grad` in ParallelExecutor, that is, CoeffNumDevice, - One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` - according to the number of devices. If you want to customize :math:`loss@grad`, - you can choose Customized. Default is 'CoeffNumDevice'. - - Examples: - .. 
code-block:: python - - import numpy - import os - import paddle - import paddle.static as static - - paddle.enable_static() - - use_cuda = True - place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - exe = static.Executor(place) - - # NOTE: If you use CPU to run the program, you need - # to specify the CPU_NUM, otherwise, paddle will use - # all the number of the logic core as the CPU_NUM, - # in that case, the batch size of the input should be - # greater than CPU_NUM, if not, the process will be - # failed by an exception. - if not use_cuda: - os.environ['CPU_NUM'] = str(2) - places = static.cpu_places() - else: - places = static.cuda_places() - - data = static.data(name='X', shape=[None, 1], dtype='float32') - hidden = static.nn.fc(input=data, size=10) - loss = paddle.mean(hidden) - paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) - - exe.run(static.default_startup_program()) - - build_strategy = static.BuildStrategy() - build_strategy.gradient_scale_strategy = \ - static.BuildStrategy.GradientScaleStrategy.Customized - compiled_prog = static.CompiledProgram( - static.default_main_program()).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy, - places=places) - - dev_count = len(places) - x = numpy.random.random(size=(10, 1)).astype('float32') - loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01 - loss_grad_name = loss.name+"@GRAD" - loss_data = exe.run(compiled_prog, - feed={"X": x, loss_grad_name : loss_grad}, - fetch_list=[loss.name, loss_grad_name]) - )DOC") - .def_property( - "debug_graphviz_path", - [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, - [](BuildStrategy &self, const std::string &path) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.debug_graphviz_path_ = path; - }, - R"DOC((str, optional): debug_graphviz_path indicates the path that - writing the SSA Graph to file in the form of graphviz. - It is useful for debugging. Default is empty string, that is, "" - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.debug_graphviz_path = "./graph" - )DOC") - .def_property( - "enable_sequential_execution", - [](const BuildStrategy &self) { - return self.enable_sequential_execution_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.enable_sequential_execution_ = b; - }, - R"DOC((bool, optional): If set True, the execution order of ops would - be the same as what is in the program. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.enable_sequential_execution = True - )DOC") - .def_property( - "remove_unnecessary_lock", - [](const BuildStrategy &self) { - return self.remove_unnecessary_lock_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.remove_unnecessary_lock_ = b; - }, - R"DOC((bool, optional): If set True, some locks in GPU ops would be - released and ParallelExecutor would run faster. Default is True. 
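The build-strategy switches registered in this stretch of the binding (reduce_strategy, gradient_scale_strategy, debug_graphviz_path, enable_sequential_execution, remove_unnecessary_lock) are normally configured together on one BuildStrategy object before the program is compiled. A minimal, illustrative sketch of that combined flow, assuming the usual paddle.static entry points and omitting the model definition:

.. code-block:: python

    import paddle
    import paddle.static as static

    paddle.enable_static()

    build_strategy = static.BuildStrategy()
    # Spread parameter optimization across devices instead of AllReduce.
    build_strategy.reduce_strategy = static.BuildStrategy.ReduceStrategy.Reduce
    # Dump the SSA graph as graphviz files for debugging ("" disables it).
    build_strategy.debug_graphviz_path = "./graph"
    # Keep op execution order identical to the order in the program.
    build_strategy.enable_sequential_execution = True
    # Release some GPU locks so execution can run faster (default True).
    build_strategy.remove_unnecessary_lock = True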
- - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.remove_unnecessary_lock = True - )DOC") - .def_property( - "num_trainers", - [](const BuildStrategy &self) { return self.num_trainers_; }, - [](BuildStrategy &self, int num_trainers) { -#ifdef WIN32 - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); -#endif - self.num_trainers_ = num_trainers; - }) - .def_property( - "trainers_endpoints", - [](const BuildStrategy &self) { return self.trainers_endpoints_; }, - [](BuildStrategy &self, - const std::vector &trainers_endpoints) { - self.trainers_endpoints_ = trainers_endpoints; - }) - .def_property( - "trainer_id", - [](const BuildStrategy &self) { return self.trainer_id_; }, - [](BuildStrategy &self, int trainer_id) { - self.trainer_id_ = trainer_id; - }) - .def_property( - "nccl_comm_num", - [](const BuildStrategy &self) { return self.nccl_comm_num_; }, - [](BuildStrategy &self, int nccl_comm_num) { - self.nccl_comm_num_ = nccl_comm_num; - }) - .def_property( - "bkcl_comm_num", - [](const BuildStrategy &self) { return self.bkcl_comm_num_; }, - [](BuildStrategy &self, int bkcl_comm_num) { - self.bkcl_comm_num_ = bkcl_comm_num; - }) - .def_property( - "use_hierarchical_allreduce", - [](const BuildStrategy &self) { - return self.use_hierarchical_allreduce_; - }, - [](BuildStrategy &self, bool use) { - self.use_hierarchical_allreduce_ = use; - }) - .def_property( - "hierarchical_allreduce_inter_nranks", - [](const BuildStrategy &self) { - return self.hierarchical_allreduce_inter_nranks_; - }, - [](BuildStrategy &self, int nranks) { - self.hierarchical_allreduce_inter_nranks_ = nranks; - }) - - .def_property( - "fuse_elewise_add_act_ops", - [](const BuildStrategy &self) { - return self.fuse_elewise_add_act_ops_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_elewise_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_elewise_add_act_ops indicate whether - to fuse elementwise_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_elewise_add_act_ops = True - )DOC") - .def_property( - "fuse_gemm_epilogue", - [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_gemm_epilogue_ = b; - }, - R"DOC((bool, optional): fuse_gemm_epilogue indicate whether - to fuse matmul_op, elemenewist_add_op and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_gemm_epilogue = True - )DOC") - .def_property( - "fuse_bn_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_bn_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_act_ops indicate whether - to fuse batch_norm and activation_op, - it may make the execution faster. Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_bn_act_ops = True - )DOC") - .def_property( - "fuse_bn_add_act_ops", - [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_bn_add_act_ops_ = b; - }, - R"DOC((bool, optional): fuse_bn_add_act_ops indicate whether - to fuse batch_norm, elementwise_add and activation_op, - it may make the execution faster. Default is True - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_bn_add_act_ops = True - )DOC") - .def_property( - "enable_auto_fusion", - [](const BuildStrategy &self) { return self.enable_auto_fusion_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.enable_auto_fusion_ = b; - }, - R"DOC((bool, optional): Whether to enable fusing subgraph to a - fusion_group. Now we only support fusing subgraph that composed - of elementwise-like operators, such as elementwise_add/mul - without broadcast and activations. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.enable_auto_fusion = True - )DOC") - .def_property( - "fuse_relu_depthwise_conv", - [](const BuildStrategy &self) { - return self.fuse_relu_depthwise_conv_; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.fuse_relu_depthwise_conv_ = b; - }, - R"DOC((bool, optional): fuse_relu_depthwise_conv indicate whether - to fuse relu and depthwise_conv2d, - it will save GPU memory and may make the execution faster. - This options is only available in GPU devices. - Default is False. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_relu_depthwise_conv = True - )DOC") - .def_property( - "fuse_broadcast_ops", - [](const BuildStrategy &self) { - return self.fuse_broadcast_ops_ == true || - self.fuse_broadcast_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_broadcast_ops_ = b; - }, - R"DOC((bool, optional): fuse_broadcast_op indicates whether - to fuse the broadcast ops. Note that, in Reduce mode, - fusing broadcast ops may make the program faster. Because - fusing broadcast OP equals delaying the execution of all - broadcast Ops, in this case, all nccl streams are used only - for NCCLReduce operations for a period of time. Default False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.fuse_broadcast_ops = True - )DOC") - .def_property( - "fuse_all_optimizer_ops", - [](const BuildStrategy &self) { - return self.fuse_all_optimizer_ops_ == true || - self.fuse_all_optimizer_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, " - "cannot be configured again.")); - self.fuse_all_optimizer_ops_ = b; - }) - .def_property( - "sync_batch_norm", - [](const BuildStrategy &self) { return self.sync_batch_norm_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE_NE(self.IsFinalized(), - true, - platform::errors::PreconditionNotMet( - "BuildStrategy has been finlaized, cannot be " - "configured again.")); - self.sync_batch_norm_ = b; - }, - R"DOC((bool, optional): sync_batch_norm indicates whether to use - synchronous batch normalization which synchronizes the mean - and variance through multi-devices in training phase. - Current implementation doesn't support FP16 training and CPU. - And only synchronous on one machine, not all machines. - Default is False. - - Examples: - .. code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.sync_batch_norm = True - )DOC") - .def_property( - "memory_optimize", - [](const BuildStrategy &self) -> py::object { - if (self.memory_optimize_) { - return py::cast(self.memory_optimize_.get()); - } else { - return py::cast(nullptr); - } - }, - [](BuildStrategy &self, const py::handle &value) { - auto *py_obj = value.ptr(); - if (py_obj == nullptr || py_obj == Py_None) { - self.memory_optimize_ = paddle::none; - } else if (PyBool_Check(py_obj)) { - self.memory_optimize_ = (py_obj == Py_True); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "BuildStrategy.memory_optimize must be set to None, False " - "or True")); - } - }, - R"DOC((bool, optional): memory opitimize aims to save total memory - consumption, set to True to enable it. - - Default None. None means framework would choose to use or not use - this strategy automatically. Currently, None means that it is - enabled when GC is disabled, and disabled when GC is enabled. - True means enabling and False means disabling. Default is None. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.static as static - - paddle.enable_static() - - build_strategy = static.BuildStrategy() - build_strategy.memory_optimize = True - - )DOC") - .def_property( - "is_distribution", - [](const BuildStrategy &self) { return self.is_distribution_; }, - [](BuildStrategy &self, bool b) { -#ifdef WIN32 - if (b) { - PADDLE_THROW(platform::errors::Unavailable( - "Distribution mode is not supported on Windows platform.")); - } -#else - self.is_distribution_ = b; -#endif - }) - .def_property( - "async_mode", - [](const BuildStrategy &self) { return self.async_mode_; }, - [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) - .def_property( - "enable_inplace", - [](const BuildStrategy &self) { return self.enable_inplace_; }, - [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) - .def_property( - "enable_addto", - [](const BuildStrategy &self) { return self.enable_addto_; }, - [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) - .def_property( - "fuse_all_reduce_ops", - [](const BuildStrategy &self) { - return self.fuse_all_reduce_ops_ == true || - self.fuse_all_reduce_ops_ == paddle::none; - }, - [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) - .def_property( - "enable_backward_optimizer_op_deps", - [](const BuildStrategy &self) { - return self.enable_backward_optimizer_op_deps_; - }, - [](BuildStrategy &self, bool b) { - self.enable_backward_optimizer_op_deps_ = b; - }) - .def_property( - "cache_runtime_context", - [](const BuildStrategy &self) { return self.cache_runtime_context_; }, - [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) - .def_property( - "mkldnn_enabled_op_types", - [](const BuildStrategy &self) { - return self.mkldnn_enabled_op_types_; - }, - [](BuildStrategy &self, - const std::unordered_set &mkldnn_enabled_op_types) { - self.mkldnn_enabled_op_types_ = mkldnn_enabled_op_types; - }) - .def_property( - "fix_op_run_order", - [](const BuildStrategy &self) { return self.fix_op_run_order_; }, - [](BuildStrategy &self, bool fix_op_run_order) { - self.fix_op_run_order_ = fix_op_run_order; - }) - .def_property( - "allow_cuda_graph_capture", - [](const BuildStrategy &self) { - return self.allow_cuda_graph_capture_; - }, - [](BuildStrategy &self, bool allow_cuda_graph_capture) { - self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; - }) - .def("_copy", - [](const BuildStrategy &self) { - auto new_bs = self; - new_bs.ClearFinalized(); - return new_bs; - }) - .def( - "_finalize_strategy_and_create_passes", - [](BuildStrategy &self) -> std::shared_ptr { - return self.CreatePassesFromStrategy(true); - }, - R"DOC(Allow user to customized passes. Normally model-specific - optimization passes should be defined in this way. BuildStrategy - cannot be updated after being finalized.)DOC"); - - m.def("_set_cached_executor_build_strategy", - [](int64_t program_id, const BuildStrategy &build_strategy) { - auto &cached_exe_info = framework::ExecutorInfoCache::Instance(); - cached_exe_info.SetBuildStrategy(program_id, build_strategy); - }); - - pe.def(py::init &, - const std::vector &, - const std::string &, - Scope *, - std::vector &, - const ExecutionStrategy &, - const BuildStrategy &, - ir::Graph *>()) - // NOTE: even we return a vec* to Python use reference policy. - // We still cannot get local_scope from this vector, since the element - // of vec will be freed by Python GC. We can only return Scope* - // one by one and mark them as reference. 
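In practice the ParallelExecutor bound here is not constructed directly from Python; an ExecutionStrategy and a BuildStrategy are handed to CompiledProgram.with_data_parallel, which creates the executor internally. A hedged sketch of that path, mirroring the examples embedded in the docstrings above (the CPU_NUM setting and the tiny model exist only to make it runnable):

.. code-block:: python

    import os
    import paddle
    import paddle.static as static

    paddle.enable_static()
    os.environ['CPU_NUM'] = str(2)

    data = static.data(name='X', shape=[None, 1], dtype='float32')
    hidden = static.nn.fc(input=data, size=10)
    loss = paddle.mean(hidden)
    paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

    exec_strategy = static.ExecutionStrategy()
    exec_strategy.num_threads = 4
    exec_strategy.num_iteration_per_drop_scope = 10

    build_strategy = static.BuildStrategy()
    build_strategy.enable_inplace = True
    build_strategy.fuse_all_reduce_ops = True

    # with_data_parallel consumes both strategies and builds the C++
    # ParallelExecutor under the hood.
    compiled_prog = static.CompiledProgram(
        static.default_main_program()).with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy,
            places=static.cpu_places())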
- .def( - "local_scopes", - [](ParallelExecutor &self) -> std::vector * { - return &self.GetLocalScopes(); - }, - py::return_value_policy::reference) - .def("drop_local_exe_scopes", &ParallelExecutor::DropLocalExeScopes) - .def("_need_create_local_exe_scopes", - &ParallelExecutor::NeedCreateLocalExeScope) - .def("feed_tensors_into_local_scopes", - &ParallelExecutor::FeedTensorsIntoLocalScopes) - .def("feed_and_split_tensor_into_local_scopes", - &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) - .def("run", - [](ParallelExecutor &self, - const std::vector &fetch_tensors, - bool return_merged) -> py::object { - if (return_merged) { - paddle::framework::FetchList ret; - /*gil_scoped_release*/ { - pybind11::gil_scoped_release release; - ret = self.RunAndMerge(fetch_tensors); - } - return py::cast(std::move(ret)); - } else { - paddle::framework::FetchUnmergedList ret; - /*gil_scoped_release*/ { - pybind11::gil_scoped_release release; - ret = self.Run(fetch_tensors); - } - return py::cast(std::move(ret)); - } - }) - .def("device_count", &ParallelExecutor::DeviceCount); - #ifdef PADDLE_WITH_IPU py::class_>( @@ -4790,6 +2352,9 @@ All parameter, weight, gradient are variables in Paddle. BindFleetWrapper(&m); BindIO(&m); + BindParallelExecutor(m); + BindPlace(m); + BindTensor(m); #if defined(PADDLE_WITH_PSLIB) && !defined(PADDLE_WITH_HETERPS) BindHeterWrapper(&m); diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc new file mode 100644 index 0000000000000..6ee72e0c1630b --- /dev/null +++ b/paddle/fluid/pybind/tensor.cc @@ -0,0 +1,1106 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include +#include +#include +#include +#include +#include +#include // NOLINT // for call_once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_cache.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include "paddle/fluid/framework/ir/cost_model.h" +#include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_executor/executor_statistics.h" +#include "paddle/fluid/framework/new_executor/standalone_executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/framework/prune.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/save_load_util.h" +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/selected_rows_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#endif +#include "paddle/fluid/memory/allocation/mmap_allocator.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/operators/py_func_op.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" +#include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/pybind/io.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/lod_utils.h" +#include "paddle/utils/none.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/pybind/ascend_wrapper_py.h" +#endif +#include "paddle/fluid/pybind/bind_cost_model.h" +#include 
"paddle/fluid/pybind/bind_fleet_executor.h" +#include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/communication.h" +#include "paddle/fluid/pybind/compatible.h" +#include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" +#include "paddle/fluid/pybind/generator_py.h" +#include "paddle/fluid/pybind/global_value_getter_setter.h" +#include "paddle/fluid/pybind/gloo_context_py.h" +#include "paddle/fluid/pybind/gloo_wrapper_py.h" +#include "paddle/fluid/pybind/heter_wrapper_py.h" +#include "paddle/fluid/pybind/inference_api.h" +#include "paddle/fluid/pybind/ir.h" +#include "paddle/fluid/pybind/metrics_py.h" +#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" +#include "paddle/phi/backends/device_manager.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/pybind/nccl_wrapper_py.h" +#endif +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/pybind/protobuf.h" +#include "paddle/fluid/pybind/pybind.h" // NOLINT +#include "paddle/fluid/pybind/reader_py.h" +#include "paddle/fluid/pybind/tensor_py.h" +#include "paddle/fluid/string/to_string.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#endif +#ifndef PADDLE_WITH_HIP +#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" +#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#endif + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#endif + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/phi/capi/capi.h" +#endif + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#endif + +#ifdef PADDLE_WITH_CRYPTO +#include "paddle/fluid/pybind/crypto.h" +#endif + +#if defined PADDLE_WITH_PSCORE +#include "paddle/fluid/pybind/fleet_py.h" +#endif + +#ifdef PADDLE_WITH_CINN +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#endif + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/tensor.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "pybind11/stl.h" + +DECLARE_bool(use_mkldnn); + +// disable auto conversion to list in Python +PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchUnmergedList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchList); +PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); + +namespace paddle { +namespace pybind { + +PyTypeObject *g_framework_tensor_pytype = nullptr; + +template +static void TensorCopyFrom(framework::Tensor *dst, + const framework::Tensor &src, + const PlaceType &place, + int64_t batch_size) { + if (batch_size < 0) { + 
framework::TensorCopy(src, place, dst); + } else { + auto sliced = src.Slice(0, batch_size); + framework::TensorCopy(sliced, place, dst); + } +} + +void BindTensor(pybind11::module &m) { // NOLINT + using namespace paddle::framework; // NOLINT + py::class_ framework_tensor( + m, "Tensor", py::buffer_protocol()); + g_framework_tensor_pytype = + reinterpret_cast(framework_tensor.ptr()); + framework_tensor + .def("__array__", + [](framework::Tensor &self) { return TensorToPyArray(self); }) + .def("_ptr", + [](const framework::Tensor &self) { + return reinterpret_cast(self.data()); + }) + .def("_slice", &framework::Tensor::Slice) + .def("_numel", &framework::Tensor::numel) + .def("_is_initialized", + [](const framework::Tensor &self) { return self.IsInitialized(); }) + .def("_get_dims", + [](const framework::Tensor &self) { return vectorize(self.dims()); }) + .def("_set_dims", + [](framework::Tensor &self, const std::vector &dim) { + self.Resize(phi::make_ddim(dim)); + }) + .def("_set_layout", + [](framework::Tensor &self, const std::string &layout) { + self.set_layout(StringToDataLayout(layout)); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::NPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_double", + [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CustomPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::XPUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::CUDAPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, paddle::platform::MLUPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_int", + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) + .def("_alloc_float", + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place) { + self.mutable_data(place); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CustomPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::XPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, 
framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CUDAPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CUDAPinnedPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::MLUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_clear", &framework::Tensor::clear) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::NPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false, + R"DOC( + Set the data of Tensor on place with given numpy array. + + Args: + lod (numpy.ndarray): The data to set. + place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the + Tensor is to be set. + zero_copy (bool, optional): Whether to share memory with the input numpy array. + This parameter only works with CPUPlace. Default: False. + + Returns: + None. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + )DOC") + + .def( + "shape", + [](framework::Tensor &self) { return vectorize(self.dims()); }, + R"DOC( + Return the shape of Tensor. 
+ + Returns: + list[int]: The shape of Tensor. + + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + print(t.shape()) # [5, 30] + )DOC") + .def("_to_dlpack", + [](framework::Tensor &self) { + DLPackTensor dlpack_tensor(self, 1); + DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor(); + auto capsule = py::capsule( + static_cast(dmt), "dltensor", [](PyObject *ptr) { + if (ptr) { + auto dltensor = new DLManagedTensor; + try { + dltensor = reinterpret_cast( + PyCapsule_GetPointer(ptr, "used_dltensor")); + return; + } catch (...) { + dltensor = reinterpret_cast( + PyCapsule_GetPointer(ptr, "dltensor")); + } + dltensor->deleter(dltensor); + } + }); + return capsule; + }) + .def("_set_float_element", TensorSetElement) + .def("_get_float_element", TensorGetElement) + .def("_set_double_element", TensorSetElement) + .def("_get_double_element", TensorGetElement) + .def("_place", [](framework::Tensor &self) { return self.place(); }) + .def("_dtype", + [](framework::Tensor &self) { + return framework::TransToProtoVarType(self.type()); + }) + .def("_layout", + [](framework::Tensor &self) { + return DataLayoutToString(self.layout()); + }) + .def("_share_data_with", &framework::Tensor::ShareDataWith) + .def("__getitem__", PySliceTensor, py::return_value_policy::reference) + .def("__str__", + [](const framework::Tensor &self) { + std::stringstream ostr; + ostr << self; + return ostr.str(); + }) /* ------ End of original Tensor ------ */ + .def("__init__", + [](framework::Tensor &instance, + const std::vector> + &recursive_sequence_lengths) { + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, -1), + true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is %s", + new_lod)); + new (&instance) framework::Tensor(new_offset_lod); + }) + .def("__init__", + [](framework::Tensor &instance) { + new (&instance) framework::Tensor(); + }) + // We implement offset based LOD in C++ while we use length based with + // Python API. So we changed set_lod to set_recursive_sequence_lengths + // to + // avoid misuse. + // The discussion is here: + // https://github.com/PaddlePaddle/Paddle/issues/10855 + .def( + "set_lod", + [](framework::Tensor &self, + const std::vector> &lod) { + // the input lod is offset-based level-of-detail info + LoD new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + PADDLE_ENFORCE_EQ( + CheckLoD(new_lod, vectorize(self.dims()).front()), + true, + platform::errors::InvalidArgument( + "The provided LoD is invalid, the LoD is %s", new_lod)); + self.set_lod(new_lod); + }, + py::arg("lod"), + R"DOC( + Set LoD of the Tensor. + + Args: + lod (list[list[int]]): The lod to set. + + Returns: + None. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_lod([[0, 2, 5]]) + print(t.lod()) # [[0, 2, 5]] + )DOC") + .def( + "set_recursive_sequence_lengths", + [](framework::Tensor &self, + const std::vector> + &recursive_sequence_lengths) { + // the input recursive_sequence_lengths is length-based + // level-of-detail info + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), + true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is " + "invalid, " + "the LoD converted by recursive_sequence_lengths is " + "%s", + new_lod)); + self.set_lod(new_offset_lod); + }, + py::arg("recursive_sequence_lengths"), + R"DOC( + Set LoD of the Tensor according to recursive sequence lengths. + + For example, if recursive_sequence_lengths=[[2, 3]], which means + there are two sequences with length 2 and 3 respectively, the + corresponding lod would be [[0, 2, 2+3]], i.e., [[0, 2, 5]]. + + Args: + recursive_sequence_lengths (list[list[int]]): The recursive sequence lengths. + + Returns: + None. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_recursive_sequence_lengths([[2, 3]]) + print(t.recursive_sequence_lengths()) # [[2, 3]] + print(t.lod()) # [[0, 2, 5]] + )DOC") + .def( + "lod", + [](framework::Tensor &self) -> std::vector> { + // output the offset-based lod info + LoD lod = self.lod(); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( + Return the LoD of the Tensor. + + Returns: + list[list[int]]: The lod of the Tensor. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_lod([[0, 2, 5]]) + print(t.lod()) # [[0, 2, 5]] + )DOC") + // Set above comments of set_lod. + .def( + "recursive_sequence_lengths", + [](framework::Tensor &self) -> std::vector> { + // output the length-based lod info + LoD lod = phi::ConvertToLengthBasedLoD(self.lod()); + std::vector> new_lod; + new_lod.reserve(lod.size()); + std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); + return new_lod; + }, + R"DOC( + Return the recursive sequence lengths corresponding to of the LodD + of the Tensor. + + Returns: + list[list[int]]: The recursive sequence lengths. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_recursive_sequence_lengths([[2, 3]]) + print(t.recursive_sequence_lengths()) # [[2, 3]] + )DOC") + .def( + "has_valid_recursive_sequence_lengths", + [](framework::Tensor &self) -> bool { + // Check that the lod info is valid and match the outermost + // dimension of the Tensor data + return CheckLoD(self.lod(), vectorize(self.dims()).front()); + }, + R"DOC( + Check whether the LoD of the Tensor is valid. + + Returns: + bool: Whether the LoD is valid. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + import numpy as np + + t = fluid.Tensor() + t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set_recursive_sequence_lengths([[2, 3]]) + print(t.has_valid_recursive_sequence_lengths()) # True + )DOC") + .def("_as_type", + [](const framework::Tensor &self, + paddle::framework::proto::VarType::Type type) { + framework::Tensor dst; + if (self.IsInitialized() && self.numel() > 0) { + TransDataType(self, type, &dst); + } + return dst; + }) + .def("_copy", + [](const framework::Tensor &self, const platform::Place &place) { + // follow fetch_op's inplementation + framework::Tensor dst; + if (self.IsInitialized() && self.numel() > 0) { + TensorCopySync(self, place, &dst); + } else { + // Not copy, if the src tensor is empty. + dst.clear(); + dst.Resize({0}); + } + dst.set_lod(self.lod()); + return dst; +#ifdef _WIN32 + }); +#else + }) +#ifdef PADDLE_WITH_CUDA + .def("_share_buffer_with", + [](framework::Tensor &self, const framework::Tensor src, + py::tuple t) { + auto *cuda_ipc_allocation = + dynamic_cast( + src.Holder().get()); + + PADDLE_ENFORCE_NOT_NULL( + cuda_ipc_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not Cuda IPC shared tensor. " + "Now only Tensor shared by cuda ipc could use this " + "api.")); + + size_t size = t[0].cast(); + auto dtype = + static_cast(t[1].cast()); + auto dims = phi::make_ddim(t[2].cast>()); + auto lod_info = t[3].cast(); + auto device_id = t[4].cast(); + + auto shared_reader_holder = + std::make_shared( + cuda_ipc_allocation->ptr(), + cuda_ipc_allocation->base_ptr(), size, + platform::CUDAPlace(device_id)); + + self.ResetHolderWithType(shared_reader_holder, dtype); + self.Resize(dims); + self.set_lod(lod_info); + + VLOG(6) << "Reconstructed tensor with buffer shared!"; + }, + R"DOC( + Deserialize GPU Tensor for existed shared Cuda IPC tensor. + + Params: + tensor: Shared Cuda IPC tensor. + tuple: contrains data size, data type, + tensor dims, lod information, device index. + + )DOC") + .def("_share_cuda", + [](framework::Tensor self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass " + "to shared memory. "); + + auto *holder = dynamic_cast( + self.Holder().get()); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(holder->place()), true, + platform::errors::InvalidArgument( + "Tensor is not on GPU. share_cuda only support GPU " + "Tensor, share_filename is for CPU tensor.")); + + void *base_ptr = holder->base_ptr(); + ptrdiff_t offset_bytes = reinterpret_cast(holder->ptr()) - + reinterpret_cast(base_ptr); + + cudaIpcMemHandle_t handle; + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcGetMemHandle(&handle, base_ptr)); + + auto _handle = py::bytes(reinterpret_cast(&handle), + (py::ssize_t)CUDA_IPC_HANDLE_SIZE); + + // TODO(ZHUI): use cuda event, to avoid sync. + const auto &device_id = paddle::platform::GetCurrentDeviceId(); + auto stream = + paddle::platform::stream::get_current_stream(device_id); + stream->Synchronize(); + + int type_idx = static_cast(self.type()); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + return py::make_tuple(_handle, (py::size_t)offset_bytes, data_size, + type_idx, vectorize(self.dims()), self.lod(), + device_id); + }, + R"DOC( + Serialize GPU Tensor by cudaIpcMemHandle. + + Returns: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. 
code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + + )DOC") + .def("_new_shared_cuda", + [](py::tuple t) { + if (t.size() != 7) + throw std::runtime_error( + "Invalid Tensor meta info for shared cuda tensor!"); + + // 1. Create a new C++ instance + framework::Tensor tensor; + + // 2. Rebuild Allocation from handle + const std::string &handle = t[0].cast(); + ptrdiff_t offset_bytes = (ptrdiff_t)t[1].cast(); + auto device_id = t[6].cast(); + auto base_ptr = memory::allocation::GetIpcBasePtr(handle); + size_t size = t[2].cast(); + void *dev = base_ptr.get(); + dev = reinterpret_cast(dev) + offset_bytes; + + auto shared_reader_holder = + std::make_shared( + dev, size, device_id, std::move(base_ptr)); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[3].cast())); + tensor.Resize(phi::make_ddim(t[4].cast>())); + tensor.set_lod(t[5].cast()); + + return tensor; + }, + R"DOC( + Deserialize GPU lod tensor from cudaIpcMemHandle. + + Params: + tuple: contrains handle, data size, data type, + tensor dims, lod information, device index. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_cuda() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_cuda(metainfo)) + + )DOC") +#endif + .def("_share_filename", + [](framework::Tensor &self) { + if (!self.IsInitialized() || self.numel() == 0) + throw std::runtime_error( + "Tensor not initialized or numel is 0. could not pass to " + "shared memory. "); + + auto holder = self.Holder(); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(holder->place()) || + platform::is_cuda_pinned_place(holder->place()), + true, platform::errors::InvalidArgument( + "Tensor is not on CPU. share_filename only " + "support CPU Tensor.")); + + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + holder.get()); + // If the tensor is not shared, allocate memory map allocation. + if (mmap_allocation == nullptr) { + void *data_ptr = self.data(); + size_t data_size = + self.numel() * + framework::SizeOfType( + framework::TransToProtoVarType(self.type())); + + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_EXCLUSIVE; + std::string handle = memory::allocation::GetIPCName(); + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + handle, flags, data_size); + + // copy data & reset holder + if (platform::is_cuda_pinned_place(holder->place())) { +#ifdef PADDLE_WITH_CUDA + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CUDAPinnedPlace(), data_ptr, data_size); +#endif + } else { + memory::Copy(platform::CPUPlace(), shared_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + } + self.ResetHolder(shared_holder); + mmap_allocation = shared_holder.get(); + } + int type_idx = static_cast(self.type()); + + return py::make_tuple(mmap_allocation->ipc_name(), + mmap_allocation->size(), type_idx, + vectorize(self.dims()), self.lod()); + }, + R"DOC( + Serialize CPU lod tensor in shared memory to tuple. + If the tensor is not in shared memory, we will copy it first. + + Returns: + tuple: contrains ipc name, data size, data type, + tensor dims and lod imformation. + + Examples: + .. 
code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + + )DOC") + .def("_new_shared_filename", + [](py::tuple t) { // __setstate__ + if (t.size() != 5) + throw std::runtime_error("Invalid Tensor meta info state!"); + + framework::Tensor tensor; + + // 2. Rebuild Allocation + const std::string &ipc_name = t[0].cast(); + size_t size = t[1].cast(); + int flags = memory::allocation::MAPPED_SHAREDMEM | + memory::allocation::MAPPED_NOCREATE; + + auto shared_holder = + memory::allocation::AllocateRefcountedMemoryMapAllocation( + ipc_name, flags, size); + + // 3. Rebuild Tensor + tensor.ResetHolderWithType( + shared_holder, + static_cast(t[2].cast())); + tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.set_lod(t[4].cast()); + + return tensor; + }, + R"DOC( + Deserialize CPU lod tensor from shared memory. + + Params: + tuple: contrains ipc file name, data size, data type, + tensor dims and lod information. + + Examples: + .. code-block:: python + + import paddle + tensor = paddle.ones([3,3]) + metainfo = tensor.value().get_tensor()._share_filename() + tensor_from_shared = paddle.to_tensor(paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)) + + )DOC") + .def("_shared_incref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->incref(); + } + }, + R"DOC( + Increase reference count of share_filename tensor. + )DOC") + .def("_shared_decref", + [](framework::Tensor &self) { + auto *mmap_allocation = dynamic_cast< + memory::allocation::RefcountedMemoryMapAllocation *>( + self.Holder().get()); + if (mmap_allocation) { + mmap_allocation->decref(); + } + }, + R"DOC( + Decrease reference count of share_filename tensor. + )DOC") + .def(py::pickle( + [](const framework::Tensor &t) { // __getstate__ + auto holder = t.Holder(); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true, + platform::errors::PreconditionNotMet( + "Tensor is not on CPU." + "Now only Tensor on CPU can be serialized.")); + auto *mmap_writer_allocation = + dynamic_cast( + holder.get()); + PADDLE_ENFORCE_NOT_NULL( + mmap_writer_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not in shared memory." + "Now only Tensor on shared memory can be serialized.")); + int type_idx = static_cast(t.type()); + + return py::make_tuple(mmap_writer_allocation->ipc_name(), + mmap_writer_allocation->size(), type_idx, + vectorize(t.dims()), t.lod()); + }, + [](py::tuple t) { // __setstate__ + if (t.size() != 5) + throw std::runtime_error("Invalid Tensor state!"); + + // 1. Create a new C++ instance + framework::Tensor tensor; + + // 2. Rebuild Allocation + const std::string &ipc_name = t[0].cast(); + size_t size = t[1].cast(); + auto shared_reader_holder = + memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name, + size); + + // 3. Maintain global fd set + VLOG(3) << "Tensor ipc name: " << ipc_name; + memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); + + // 4. 
Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[2].cast())); + tensor.Resize(phi::make_ddim(t[3].cast>())); + tensor.set_lod(t[4].cast()); + + return tensor; + })); +#endif + + py::class_(m, "SelectedRows") + .def("__init__", + [](phi::SelectedRows &instance) { + new (&instance) phi::SelectedRows(); + }) + .def("__init__", + [](phi::SelectedRows &instance, + const std::vector rows, + const int64_t &height) { + new (&instance) phi::SelectedRows(rows, height); + }) + .def( + "get_tensor", + [](phi::SelectedRows &self) { return self.mutable_value(); }, + py::return_value_policy::reference) + .def("numel", + [](phi::SelectedRows &self) -> int64_t { + return self.value().numel(); + }) + .def("set_height", &phi::SelectedRows::set_height) + .def("height", &phi::SelectedRows::height) + .def("set_rows", + [](phi::SelectedRows &self, std::vector rows) { +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) + self.set_rows(rows); +#else + Vector new_rows(rows); + self.set_rows(new_rows); +#endif + }) + .def("sync_index", + [](phi::SelectedRows &instance) { instance.SyncIndex(); }) + .def("rows", [](phi::SelectedRows &self) { + auto rows = self.rows(); + std::vector new_rows; + new_rows.reserve(rows.size()); + std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); + return new_rows; + }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/tensor.h b/paddle/fluid/pybind/tensor.h new file mode 100644 index 0000000000000..a21236724b885 --- /dev/null +++ b/paddle/fluid/pybind/tensor.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
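The `_share_filename` / `_new_shared_filename` pair bound above is the CPU-side counterpart of the CUDA IPC path: the producer copies its data (if it is not already shared) into a named shared-memory mapping and returns a metainfo tuple, and a consumer rebuilds a tensor view over the same mapping from that tuple. A minimal round-trip sketch in Python, kept in a single process for brevity (in real use the tuple travels to another process, e.g. through a multiprocessing queue); the incref/decref placement is illustrative only, not a prescription from this patch:

    import paddle

    paddle.set_device("cpu")  # _share_filename only supports CPU / cuda-pinned tensors

    src = paddle.ones([3, 3])
    dense = src.value().get_tensor()

    # Producer side: serialize into a named shared-memory mapping.
    metainfo = dense._share_filename()
    dense._shared_incref()  # keep the mapping alive while the tuple is in flight

    # Consumer side (normally another process): rebuild a view over the mapping.
    rebuilt = paddle.fluid.core.LoDTensor._new_shared_filename(metainfo)
    shared_view = paddle.to_tensor(rebuilt)

    dense._shared_decref()  # release the extra reference

In-tree, these bindings are intended to underpin the tensor-sharing reductions of `paddle.incubate.multiprocessing`, which handle the reference counting automatically.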
+ +#pragma once + +#include "pybind11/pybind11.h" + +namespace paddle { +namespace pybind { + +void BindTensor(pybind11::module& m); // NOLINT + +} // namespace pybind +} // namespace paddle From 246ac9764de419985be99d00dc89eaeff1aca322 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Thu, 14 Jul 2022 20:44:04 +0800 Subject: [PATCH 210/250] [operator migration] Migrate infer shape for merged momentum (#44338) --- .../final_state_generator/codegen_utils.py | 1 + .../operators/optimizers/merged_momentum_op.cc | 12 +++++++++--- paddle/phi/api/lib/data_transform.cc | 10 ++++++++++ paddle/phi/api/lib/data_transform.h | 5 +++++ paddle/phi/api/yaml/generator/api_base.py | 2 ++ paddle/phi/infermeta/multiary.cc | 16 ++++++++++++++++ paddle/phi/infermeta/multiary.h | 16 ++++++++++++++++ 7 files changed, 59 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index cd5805740bef0..79f5da4bec79e 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -45,6 +45,7 @@ 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ 'str' : 'std::string', \ + 'str[]' : 'std::vector', 'float[]' : 'std::vector', \ 'Place' : 'paddle::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc index 220c0be9ddf0f..85b2f818fe137 100644 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
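The changes below drop the empty `InferShape` override of `merged_momentum` and register a phi `MergedMomentumInferMeta` functor in its place, so shape inference for the merged (multi-tensor) momentum update is declared through the same infermeta mechanism as other migrated ops. The op itself is an internal fused kernel; from Python it is normally reached through the momentum optimizer rather than invoked directly. A hedged sketch of that path — the `use_multi_tensor` switch on `paddle.optimizer.Momentum` is assumed here to be the user-facing entry point and is not introduced by this patch:

    import paddle

    layer = paddle.nn.Linear(4, 4)
    opt = paddle.optimizer.Momentum(learning_rate=0.1,
                                    momentum=0.9,
                                    parameters=layer.parameters(),
                                    use_multi_tensor=True)  # fuse updates into merged_momentum

    x = paddle.rand([2, 4])
    loss = layer(x).mean()
    loss.backward()
    opt.step()        # one merged update over [weight, bias] instead of per-parameter calls
    opt.clear_grad()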
+#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -22,8 +25,6 @@ class MergedMomentumOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override {} - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto param_dtype = @@ -100,6 +101,11 @@ class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(merged_momentum, + MergedMomentumInferShapeFunctor, + PD_INFER_META(phi::MergedMomentumInferMeta)); + REGISTER_OP_WITHOUT_GRADIENT(merged_momentum, ops::MergedMomentumOp, - ops::MergedMomentumOpMaker); + ops::MergedMomentumOpMaker, + MergedMomentumInferShapeFunctor); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 4dafc7a7ee579..58795c0f06381 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -284,5 +284,15 @@ std::unique_ptr> PrepareData( return pt_tensors; } +paddle::optional> PrepareData( + const paddle::optional>& inputs, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + if (inputs) { + return {*PrepareData(*inputs, target_args_def, transform_flag)}; + } + return paddle::none; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index 4d70078ef3444..3feba2465f61b 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -76,5 +76,10 @@ std::unique_ptr> PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); +paddle::optional> PrepareData( + const paddle::optional>& inputs, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag); + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py index aacb4ce55befa..2659d80615f2d 100644 --- a/paddle/phi/api/yaml/generator/api_base.py +++ b/paddle/phi/api/yaml/generator/api_base.py @@ -131,9 +131,11 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'long': 'long', 'size_t': 'size_t', 'float': 'float', + 'float[]': 'const std::vector&', 'double': 'double', 'bool': 'bool', 'str': 'const std::string&', + 'str[] ': 'const std::vector&', 'Place': 'const Place&', 'DataLayout': 'DataLayout', 'DataType': 'DataType', diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 575e60923cd21..3369b0c392ec3 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1549,6 +1549,22 @@ void MergedAdamInferMeta( std::vector beta2_pow_out, std::vector master_param_out) {} +void MergedMomentumInferMeta( + const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, + float mu, + bool use_nesterov, + const std::vector& regularization_method, + const std::vector& regularization_coeff, + bool multi_precision, + float rescale_grad, + std::vector param_out, + std::vector velocity_out, + std::vector master_param_out) {} + void 
MeshgridInferMeta(const std::vector& inputs, std::vector outputs) { const size_t inputs_num = inputs.size(); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index c0972816f3ba2..0ec71e86893c3 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -255,6 +255,22 @@ void MergedAdamInferMeta( std::vector beta2_pow_out, std::vector master_param_out); +void MergedMomentumInferMeta( + const std::vector& param, + const std::vector& grad, + const std::vector& velocity, + const std::vector& learning_rate, + const paddle::optional>& master_param, + float mu, + bool use_nesterov, + const std::vector& regularization_method, + const std::vector& regularization_coeff, + bool multi_precision, + float rescale_grad, + std::vector param_out, + std::vector velocity_out, + std::vector master_param_out); + void MeshgridInferMeta(const std::vector& inputs, std::vector outputs); From dc5a04202db0feb34e7949642d90e1219b2d6e10 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 14 Jul 2022 08:01:02 -0500 Subject: [PATCH 211/250] refine allocation cmake (#44241) * build into one static library * move memory/detail to memory/allocation * fix bug * fix profiler * fix framework_proto * fix deps * fix inference compilation * fix rocm compile * follow comments * fix buddy_allocator_test --- cmake/inference_lib.cmake | 6 +- paddle/fluid/memory/CMakeLists.txt | 3 +- paddle/fluid/memory/allocation/CMakeLists.txt | 319 +++++++----------- .../{detail => allocation}/buddy_allocator.cc | 2 +- .../{detail => allocation}/buddy_allocator.h | 4 +- .../buddy_allocator_test.cc | 4 +- .../{detail => allocation}/memory_block.cc | 2 +- .../{detail => allocation}/memory_block.h | 0 .../memory_block_desc.cc | 2 +- .../{detail => allocation}/meta_cache.cc | 2 +- .../allocation/naive_best_fit_allocator.cc | 4 +- .../system_allocator.cc | 2 +- .../{detail => allocation}/system_allocator.h | 0 .../system_allocator_test.cc | 2 +- .../allocation/thread_local_allocator.h | 4 +- paddle/fluid/memory/detail/CMakeLists.txt | 79 ----- paddle/fluid/memory/pinned_memory_test.cu | 2 +- paddle/fluid/platform/CMakeLists.txt | 4 +- .../platform/device/gpu/cuda/CMakeLists.txt | 2 +- paddle/fluid/platform/profiler.cc | 2 + paddle/fluid/platform/profiler.h | 5 +- paddle/fluid/pybind/CMakeLists.txt | 4 - 22 files changed, 144 insertions(+), 310 deletions(-) rename paddle/fluid/memory/{detail => allocation}/buddy_allocator.cc (99%) rename paddle/fluid/memory/{detail => allocation}/buddy_allocator.h (97%) rename paddle/fluid/memory/{detail => allocation}/buddy_allocator_test.cc (99%) rename paddle/fluid/memory/{detail => allocation}/memory_block.cc (98%) rename paddle/fluid/memory/{detail => allocation}/memory_block.h (100%) rename paddle/fluid/memory/{detail => allocation}/memory_block_desc.cc (97%) rename paddle/fluid/memory/{detail => allocation}/meta_cache.cc (97%) rename paddle/fluid/memory/{detail => allocation}/system_allocator.cc (99%) rename paddle/fluid/memory/{detail => allocation}/system_allocator.h (100%) rename paddle/fluid/memory/{detail => allocation}/system_allocator_test.cc (97%) delete mode 100644 paddle/fluid/memory/detail/CMakeLists.txt diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 56345373dbe8c..865dd8643d867 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -427,10 +427,8 @@ copy( set(module "memory") copy( fluid_lib_dist - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h - ${src_dir}/${module}/allocation/*.h - DSTS 
${dst_dir}/${module} ${dst_dir}/${module}/detail - ${dst_dir}/${module}/allocation) + SRCS ${src_dir}/${module}/allocation/*.h + DSTS ${dst_dir}/${module}/allocation) set(module "platform") set(platform_lib_deps profiler_proto errors) diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 5d1f97c096bdd..eccba465051b9 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -1,4 +1,3 @@ -add_subdirectory(detail) add_subdirectory(allocation) if(WITH_MKLDNN) @@ -10,7 +9,7 @@ endif() cc_library( malloc SRCS malloc.cc - DEPS place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS}) + DEPS place enforce allocator profiler ${MKLDNN_CTX_DEPS}) cc_library( memcpy SRCS memcpy.cc diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 46a46b04b3e0c..ec8391469f94c 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,264 +1,179 @@ -cc_library( - allocator - SRCS allocator.cc - DEPS place stats profiler) -cc_library( - cpu_allocator - SRCS cpu_allocator.cc - DEPS allocator) -cc_library( - locked_allocator - SRCS locked_allocator.cc - DEPS allocator) -cc_library( - buffered_allocator - SRCS buffered_allocator.cc - DEPS allocator) -cc_library( - best_fit_allocator - SRCS best_fit_allocator.cc - DEPS allocator) -cc_library( - naive_best_fit_allocator - SRCS naive_best_fit_allocator.cc - DEPS allocator buddy_allocator) -cc_test( - naive_best_fit_allocator_test - SRCS naive_best_fit_allocator_test.cc - DEPS naive_best_fit_allocator) -cc_test( - buffered_allocator_test - SRCS buffered_allocator_test.cc - DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) +include(ExternalProject) + +set(ALLOCATOR_DEPS place stats profiler) +set(ALLOCATOR_SRCS + allocator.cc + cpu_allocator.cc + locked_allocator.cc + aligned_allocator.cc + buffered_allocator.cc + best_fit_allocator.cc + naive_best_fit_allocator.cc + allocator_strategy.cc + allocator_facade.cc + auto_growth_best_fit_allocator.cc + virtual_memory_auto_growth_best_fit_allocator.cc + retry_allocator.cc + memory_block.cc + memory_block_desc.cc + meta_cache.cc + buddy_allocator.cc + system_allocator.cc) -if(WITH_MKLDNN) - set(MKLDNN_CTX_DEPS mkldnn) -else() - set(MKLDNN_CTX_DEPS) +if(WITH_GPU OR WITH_ROCM) + list( + APPEND + ALLOCATOR_SRCS + cuda_allocator.cc + cuda_managed_allocator.cc + pinned_allocator.cc + stream_safe_cuda_allocator.cc + thread_local_allocator.cc) + list(APPEND ALLOCATOR_DEPS cuda_device_guard gpu_info dynload_cuda) endif() if(WITH_GPU) - nv_library( - cuda_allocator - SRCS cuda_allocator.cc - DEPS allocator cuda_device_guard stats) - nv_library( - cuda_managed_allocator - SRCS cuda_managed_allocator.cc - DEPS allocator cuda_device_guard gpu_info) - nv_library( - pinned_allocator - SRCS pinned_allocator.cc - DEPS allocator) - nv_library( - stream_safe_cuda_allocator - SRCS stream_safe_cuda_allocator.cc - DEPS allocator cuda_graph) - nv_library( - thread_local_allocator - SRCS thread_local_allocator.cc - DEPS allocator) + list(APPEND ALLOCATOR_DEPS cuda_graph) +endif() - cc_test( - thread_local_allocator_test - SRCS thread_local_allocator_test.cc - DEPS thread_local_allocator) - if(CUDA_VERSION GREATER_EQUAL 10.2) - nv_library( - cuda_virtual_mem_allocator - SRCS cuda_virtual_mem_allocator.cc - DEPS dynload_cuda) +if(CUDA_VERSION VERSION_GREATER_EQUAL 10.2) + list(APPEND ALLOCATOR_SRCS cuda_virtual_mem_allocator.cc) +endif() + 
+if(NOT WIN32) + list(APPEND ALLOCATOR_SRCS mmap_allocator.cc) + if(WITH_GPU) + list(APPEND ALLOCATOR_SRCS cuda_ipc_allocator.cc) endif() endif() -if(WITH_ROCM) - hip_library( - cuda_allocator - SRCS cuda_allocator.cc - DEPS allocator cuda_device_guard stats) - hip_library( - cuda_managed_allocator - SRCS cuda_managed_allocator.cc - DEPS allocator cuda_device_guard gpu_info) - hip_library( - pinned_allocator - SRCS pinned_allocator.cc - DEPS allocator) - hip_library( - stream_safe_cuda_allocator - SRCS stream_safe_cuda_allocator.cc - DEPS allocator) - hip_library( - thread_local_allocator - SRCS thread_local_allocator.cc - DEPS allocator) +if(WITH_ASCEND_CL) + list(APPEND ALLOCATOR_SRCS npu_allocator.cc npu_pinned_allocator.cc) + list(APPEND ALLOCATOR_DEPS npu_info) +endif() - cc_test( - thread_local_allocator_test - SRCS thread_local_allocator_test.cc - DEPS thread_local_allocator) +if(WITH_CUSTOM_DEVICE) + list(APPEND ALLOCATOR_SRCS custom_allocator.cc) + list(APPEND ALLOCATOR_DEPS device_manager) endif() -if(WITH_ASCEND_CL) - cc_library( - npu_allocator - SRCS npu_allocator.cc - DEPS allocator npu_info) - cc_library( - npu_pinned_allocator - SRCS npu_pinned_allocator.cc - DEPS allocator npu_info) +if(WITH_XPU) + list(APPEND ALLOCATOR_DEPS xpu_info) endif() -cc_library( - retry_allocator - SRCS retry_allocator.cc +if(WITH_IPU) + list(APPEND ALLOCATOR_DEPS ipu_info) +endif() + +add_library(allocator "${ALLOCATOR_SRCS}") +target_link_libraries(allocator ${ALLOCATOR_DEPS}) +# note: why only add dependency for framework_proto. +# Because it is needed to generate framework.pb.h used in some header files. +add_dependencies(allocator framework_proto) +set_property(GLOBAL PROPERTY FLUID_MODULES allocator) + +cc_test( + naive_best_fit_allocator_test + SRCS naive_best_fit_allocator_test.cc + DEPS allocator) +cc_test( + buffered_allocator_test + SRCS buffered_allocator_test.cc DEPS allocator) -if(WITH_GPU OR WITH_ROCM) - set(AllocatorFacadeDeps - gpu_info - cuda_allocator - cuda_managed_allocator - pinned_allocator - cuda_device_guard - thread_local_allocator - stream_safe_cuda_allocator - device_context) - if(CUDA_VERSION GREATER_EQUAL 10.2) - list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) - endif() -elseif(WITH_XPU) - set(AllocatorFacadeDeps xpu_info) -elseif(WITH_IPU) - set(AllocatorFacadeDeps ipu_info) -elseif(WITH_ASCEND) - set(AllocatorFacadeDeps ascend_npu_info) -else() - set(AllocatorFacadeDeps) +if(WITH_GPU) + nv_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS allocator) endif() - -if(WITH_CUSTOM_DEVICE) - cc_library( - custom_allocator - SRCS custom_allocator.cc - DEPS allocator device_manager) - set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator) +if(WITH_ROCM) + hip_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS allocator) endif() if(WITH_GPU) nv_test( best_fit_allocator_test SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu - DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator - device_context memcpy) + DEPS allocator memcpy) elseif(WITH_ROCM) hip_test( best_fit_allocator_test SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu - DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator - device_context memcpy) + DEPS allocator memcpy) else() cc_test( best_fit_allocator_test SRCS best_fit_allocator_test.cc - DEPS best_fit_allocator locked_allocator cpu_allocator) -endif() - -list( - APPEND - AllocatorFacadeDeps - cpu_allocator - locked_allocator - 
aligned_allocator - retry_allocator - buffered_allocator - naive_best_fit_allocator - auto_growth_best_fit_allocator - virtual_memory_auto_growth_best_fit_allocator - best_fit_allocator) - -if(WITH_ASCEND_CL) - list(APPEND AllocatorFacadeDeps npu_pinned_allocator) + DEPS allocator) endif() -cc_library( - aligned_allocator - SRCS aligned_allocator.cc - DEPS allocator) cc_test( test_aligned_allocator SRCS test_aligned_allocator.cc - DEPS aligned_allocator) -cc_library( - allocator_strategy - SRCS allocator_strategy.cc - DEPS gflags ${AllocatorFacadeDeps}) -cc_library( - allocator_facade - SRCS allocator_facade.cc - DEPS allocator_strategy stats) - -if(WITH_GPU) - target_link_libraries(allocator_facade cuda_graph) -endif() + DEPS allocator) cc_test( retry_allocator_test SRCS retry_allocator_test.cc - DEPS retry_allocator locked_allocator cpu_allocator) -if(WITH_TESTING) - if((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test) - target_link_libraries(retry_allocator_test cuda_allocator) - endif() - - if(TEST retry_allocator_test) - set_tests_properties(retry_allocator_test PROPERTIES LABELS - "RUN_TYPE=EXCLUSIVE") - endif() + DEPS allocator) +if(TEST retry_allocator_test) + set_tests_properties(retry_allocator_test PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") endif() cc_test( allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc - DEPS allocator_facade) + DEPS allocator) cc_test( allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc - DEPS allocator_facade) + DEPS allocator) -cc_library( - auto_growth_best_fit_allocator - SRCS auto_growth_best_fit_allocator.cc - DEPS allocator aligned_allocator flags) cc_test( auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc - DEPS cpu_allocator auto_growth_best_fit_allocator) + DEPS allocator) cc_test( auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc - DEPS auto_growth_best_fit_allocator) - -cc_library( - virtual_memory_auto_growth_best_fit_allocator - SRCS virtual_memory_auto_growth_best_fit_allocator.cc - DEPS allocator aligned_allocator) + DEPS allocator) if(NOT WIN32) - cc_library( - mmap_allocator - SRCS mmap_allocator.cc - DEPS allocator) cc_test( mmap_allocator_test SRCS mmap_allocator_test.cc - DEPS mmap_allocator allocator) - if(WITH_GPU) - cc_library( - cuda_ipc_allocator - SRCS cuda_ipc_allocator.cc - DEPS allocator) + DEPS allocator) +endif() + +cc_test( + system_allocator_test + SRCS system_allocator_test.cc + DEPS allocator) + +cc_test( + buddy_allocator_test + SRCS buddy_allocator_test.cc + DEPS allocator) + +if(WITH_TESTING) + if(TEST buddy_allocator_test) + set_tests_properties(buddy_allocator_test PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") + endif() + + # TODO(zhiqiu): why not win32? 
because wget is not found on windows + if(NOT WIN32) + add_custom_target( + download_data + COMMAND wget -nc + https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar + COMMAND tar -xf buddy_allocator_test_data.tar) + add_dependencies(buddy_allocator_test download_data) endif() endif() diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc similarity index 99% rename from paddle/fluid/memory/detail/buddy_allocator.cc rename to paddle/fluid/memory/allocation/buddy_allocator.cc index 90cce14c5676c..907fd37e44205 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/allocation/buddy_allocator.h" #include diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/allocation/buddy_allocator.h similarity index 97% rename from paddle/fluid/memory/detail/buddy_allocator.h rename to paddle/fluid/memory/allocation/buddy_allocator.h index 463e3cfcf6d8d..5e39e21c9664f 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/allocation/buddy_allocator.h @@ -25,8 +25,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/memory/detail/memory_block.h" -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/memory_block.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/npu/npu_info.h" diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/allocation/buddy_allocator_test.cc similarity index 99% rename from paddle/fluid/memory/detail/buddy_allocator_test.cc rename to paddle/fluid/memory/allocation/buddy_allocator_test.cc index ab558e8bfce15..ad53a784502b4 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/allocation/buddy_allocator.h" #include @@ -330,7 +330,7 @@ TEST(BuddyAllocator, SpeedAna) { std::vector vec_free_flag; std::string line; - int size, id; + int size = 0, id = 0; while (in_file >> size >> id) { vec_size.push_back(size); vec_pos.push_back(id); diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/allocation/memory_block.cc similarity index 98% rename from paddle/fluid/memory/detail/memory_block.cc rename to paddle/fluid/memory/allocation/memory_block.cc index 52f7d33aae1d3..0f0a81cf9d118 100644 --- a/paddle/fluid/memory/detail/memory_block.cc +++ b/paddle/fluid/memory/allocation/memory_block.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/allocation/memory_block.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/detail/memory_block.h b/paddle/fluid/memory/allocation/memory_block.h similarity index 100% rename from paddle/fluid/memory/detail/memory_block.h rename to paddle/fluid/memory/allocation/memory_block.h diff --git a/paddle/fluid/memory/detail/memory_block_desc.cc b/paddle/fluid/memory/allocation/memory_block_desc.cc similarity index 97% rename from paddle/fluid/memory/detail/memory_block_desc.cc rename to paddle/fluid/memory/allocation/memory_block_desc.cc index 93d2559c37f77..d20d56a6d05e8 100644 --- a/paddle/fluid/memory/detail/memory_block_desc.cc +++ b/paddle/fluid/memory/allocation/memory_block_desc.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/allocation/memory_block.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/allocation/meta_cache.cc similarity index 97% rename from paddle/fluid/memory/detail/meta_cache.cc rename to paddle/fluid/memory/allocation/meta_cache.cc index 4831e005c84c0..945b0f7b89283 100644 --- a/paddle/fluid/memory/detail/meta_cache.cc +++ b/paddle/fluid/memory/allocation/meta_cache.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "glog/logging.h" -#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/allocation/memory_block.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index d696b8bffda08..d1a3b77e7720b 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -18,8 +18,8 @@ #include "gflags/gflags.h" #include "glog/logging.h" -#include "paddle/fluid/memory/detail/buddy_allocator.h" -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/buddy_allocator.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc similarity index 99% rename from paddle/fluid/memory/detail/system_allocator.cc rename to paddle/fluid/memory/allocation/system_allocator.cc index eb5c74e56d61f..fcfece978cb7f 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #define GLOG_NO_ABBREVIATED_SEVERITIES -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include "paddle/fluid/memory/stats.h" diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h similarity index 100% rename from paddle/fluid/memory/detail/system_allocator.h rename to paddle/fluid/memory/allocation/system_allocator.h diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/allocation/system_allocator_test.cc similarity index 97% rename from paddle/fluid/memory/detail/system_allocator_test.cc rename to paddle/fluid/memory/allocation/system_allocator_test.cc index dbf3fad6c3373..4749ff3f8adb7 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/allocation/system_allocator_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 3b71ec866b663..a2c9e813f7ac6 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -18,8 +18,8 @@ #include #include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/memory/detail/buddy_allocator.h" -#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/memory/allocation/buddy_allocator.h" +#include "paddle/fluid/memory/allocation/system_allocator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt deleted file mode 100644 index afe5c0dba0f3b..0000000000000 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ /dev/null @@ -1,79 +0,0 @@ -include(ExternalProject) - -cc_library( - memory_block - SRCS memory_block.cc memory_block_desc.cc meta_cache.cc - DEPS place) - -if(WITH_GPU) - nv_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info gpu_info place) -elseif(WITH_ROCM) - hip_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info gpu_info place) -elseif(${WITH_ASCEND_CL}) - cc_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info npu_info place) -elseif(WITH_MLU) - cc_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info mlu_info place) -else() - cc_library( - system_allocator - SRCS system_allocator.cc - DEPS gflags cpu_info place) -endif() - -cc_test( - system_allocator_test - SRCS system_allocator_test.cc - DEPS system_allocator) - -cc_library( - buddy_allocator - SRCS buddy_allocator.cc - DEPS memory_block system_allocator glog) - -cc_test( - buddy_allocator_test - SRCS buddy_allocator_test.cc - DEPS buddy_allocator) - -function(file_download_and_uncompress URL NAME) - message(STATUS "Download dependence[${NAME}] from ${URL}") - set(${NAME}_INCLUDE_DIR - ${THIRD_PARTY_PATH}/${NAME} - PARENT_SCOPE) - ExternalProject_Add( - extern_download_${NAME} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${THIRD_PARTY_PATH}/${NAME} - URL ${URL} - DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME} - SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME} - DOWNLOAD_NO_PROGRESS 1 
- CONFIGURE_COMMAND "" - BUILD_COMMAND "" - UPDATE_COMMAND "" - INSTALL_COMMAND "") - set(third_party_deps - ${third_party_deps} extern_download_${NAME} - PARENT_SCOPE) -endfunction() - -if(WITH_TESTING) - if(TEST buddy_allocator_test) - set_tests_properties(buddy_allocator_test PROPERTIES LABELS - "RUN_TYPE=EXCLUSIVE") - endif() - set(URL "https://paddle-ci.cdn.bcebos.com/buddy_allocator_test_data.tar") - file_download_and_uncompress(URL "buddy_allocator") -endif() diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu index 5b982f62c86de..259222754e8f8 100644 --- a/paddle/fluid/memory/pinned_memory_test.cu +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include -#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/allocation/memory_block.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/cpu_info.h" diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 2ff31aa5b54fc..2374cfdfd3426 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -88,12 +88,12 @@ if(WITH_GPU) nv_library( cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc - DEPS device_context allocator_facade cuda_graph) + DEPS device_context allocator cuda_graph) else() cc_library( cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc - DEPS device_context allocator_facade) + DEPS device_context allocator) endif() cc_library( diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 15c7a6c462495..64a2f891c21cd 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,7 +1,7 @@ nv_library( cuda_graph SRCS cuda_graph.cc - DEPS enforce allocator_facade) + DEPS enforce) nv_library( cuda_profiler SRCS cuda_profiler.cc diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 38471251ff4a1..d02fd54578862 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -310,8 +310,10 @@ RecordOpInfoSupplement::RecordOpInfoSupplement( std::map>> RecordMemEvent::size_cache; + std::map> RecordMemEvent::has_initialized; + RecordMemEvent::RecordMemEvent(const void *ptr, const phi::Place &place, size_t size, diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 4773b1a177ba0..6046e54b6c876 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -28,7 +28,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" @@ -39,6 +38,10 @@ limitations under the License. 
*/ namespace paddle { namespace platform { +namespace proto { +class Profile; +} + const int kEnableProfiler = 1; const int kDisableProfiler = 2; diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index f301189d77824..63ebffe9f25f1 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -84,10 +84,6 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) - set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) - if(WITH_GPU) - set(PYBIND_DEPS ${PYBIND_DEPS} cuda_ipc_allocator) - endif() if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) From c6272b6abc367f397a6a09da5518b1b641652f9b Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Thu, 14 Jul 2022 21:04:47 +0800 Subject: [PATCH 212/250] Some Ops support fp16 (#44295) * sparse support amp * EagerAmpAutoCasts support sparse --- paddle/fluid/eager/eager_amp_auto_cast.h | 35 ++++++++++++------- paddle/phi/kernels/gpu/pad3d_grad_kernel.cu | 9 +++-- paddle/phi/kernels/sparse/empty_kernel.cc | 2 ++ .../kernels/sparse/gpu/unary_grad_kernel.cu | 4 +++ paddle/phi/kernels/sparse/gpu/unary_kernel.cu | 4 +++ 5 files changed, 40 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 26af2b98ca0ab..f98f25635f703 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -39,6 +39,27 @@ static inline bool NeedCast(const paddle::experimental::Tensor& tensor, return false; } +inline paddle::experimental::Tensor Cast( + const paddle::experimental::Tensor& input, + const paddle::experimental::DataType& dst_dtype, + const bool trace_backward = true) { + if (input.is_sparse_coo_tensor() || input.is_sparse_csr_tensor()) { + if (trace_backward) { + return sparse::cast_final_state_dygraph_function( + input, paddle::experimental::DataType::UNDEFINED, dst_dtype); + } else { + return paddle::experimental::sparse::cast( + input, paddle::experimental::DataType::UNDEFINED, dst_dtype); + } + } else { + if (trace_backward) { + return cast_final_state_dygraph_function(input, dst_dtype); + } else { + return paddle::experimental::cast(input, dst_dtype); + } + } +} + inline std::vector EagerAmpAutoCasts( const std::string& inputs_name, const std::vector& inputs, @@ -51,13 +72,7 @@ inline std::vector EagerAmpAutoCasts( std::vector inputs_casted; for (auto& input : inputs) { if (NeedCast(input, dst_dtype)) { - if (trace_backward) { - inputs_casted.emplace_back( - std::move(cast_final_state_dygraph_function(input, dst_dtype))); - } else { - inputs_casted.emplace_back( - std::move(paddle::experimental::cast(input, dst_dtype))); - } + inputs_casted.emplace_back(std::move(Cast(input, dst_dtype))); } else { inputs_casted.emplace_back(input); } @@ -92,11 +107,7 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( } } if (NeedCast(input, dst_dtype)) { - if (trace_backward) { - return cast_final_state_dygraph_function(input, dst_dtype); - } else { - return paddle::experimental::cast(input, dst_dtype); - } + return Cast(input, dst_dtype, trace_backward); } return input; } diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index 8f4af0a450890..e9f820a318482 100644 --- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -503,5 +503,10 @@ void Pad3dGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - 
pad3d_grad, GPU, ALL_LAYOUT, phi::Pad3dGradKernel, float, double) {} +PD_REGISTER_KERNEL(pad3d_grad, + GPU, + ALL_LAYOUT, + phi::Pad3dGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc index fe7fb72b4caa6..c1706b9919d90 100644 --- a/paddle/phi/kernels/sparse/empty_kernel.cc +++ b/paddle/phi/kernels/sparse/empty_kernel.cc @@ -97,6 +97,7 @@ PD_REGISTER_KERNEL(empty_like_coo, GPU, ALL_LAYOUT, phi::sparse::EmptyLikeCooKernel, + phi::dtype::float16, float, double, int8_t, @@ -112,6 +113,7 @@ PD_REGISTER_KERNEL(empty_like_csr, GPU, ALL_LAYOUT, phi::sparse::EmptyLikeCsrKernel, + phi::dtype::float16, float, double, int8_t, diff --git a/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu index c1f2b2a1f0d1d..be0f13fb0e538 100644 --- a/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/unary_grad_kernel.cu @@ -23,6 +23,7 @@ GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CooGradKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ @@ -32,6 +33,7 @@ GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CsrGradKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ @@ -56,6 +58,7 @@ PD_REGISTER_KERNEL(cast_coo_grad, GPU, ALL_LAYOUT, phi::sparse::CastCooGradKernel, + phi::dtype::float16, float, double, int8_t, @@ -69,6 +72,7 @@ PD_REGISTER_KERNEL(cast_csr_grad, GPU, ALL_LAYOUT, phi::sparse::CastCsrGradKernel, + phi::dtype::float16, float, double, int8_t, diff --git a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu index fdf0b5106d3cf..6358b7b983576 100644 --- a/paddle/phi/kernels/sparse/gpu/unary_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/unary_kernel.cu @@ -67,6 +67,7 @@ void DivCsrScalarKernel(const Context& dev_ctx, GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CooKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ @@ -76,6 +77,7 @@ void DivCsrScalarKernel(const Context& dev_ctx, GPU, \ ALL_LAYOUT, \ phi::sparse::prefix##CsrKernel, \ + phi::dtype::float16, \ float, \ double) { \ kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ @@ -119,6 +121,7 @@ PD_REGISTER_KERNEL(cast_coo, GPU, ALL_LAYOUT, phi::sparse::CastCooKernel, + phi::dtype::float16, float, double, int8_t, @@ -132,6 +135,7 @@ PD_REGISTER_KERNEL(cast_csr, GPU, ALL_LAYOUT, phi::sparse::CastCsrKernel, + phi::dtype::float16, float, double, int8_t, From ec38be670d1f21b39b0dd87bbba621766f59ac64 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 14 Jul 2022 21:11:44 +0800 Subject: [PATCH 213/250] [ Dy2Static ] fix the outputs of net is x,x (#44313) * fix the outputs of net is x,x * add unittest for duplicate output * fix --- .../dygraph_to_static/partial_program.py | 8 +++ .../test_duplicate_output.py | 65 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_duplicate_output.py diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 318585972f0e6..4faa4a098e016 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -441,11 +441,18 @@ def _prepare(self, 
inputs): continue input_vars.append(var) + # mapping from name(string) -> VarBase + out_varbase_map = {} + def create_out(var_id): var = self._outputs[var_id] assert isinstance(var, framework.Variable) var_desc = var.desc varbase = None + + if var_desc.name() in out_varbase_map: + return out_varbase_map[var_desc.name()] + if not framework._in_eager_mode_: var_base = core.VarBase(var_desc.dtype(), var_desc.shape(), var_desc.name(), var_desc.type(), False) @@ -453,6 +460,7 @@ def create_out(var_id): var_base = core.eager.Tensor(var_desc.dtype(), var_desc.shape(), var_desc.name(), var_desc.type(), False) + out_varbase_map[var_desc.name()] = var_base return var_base # Create VarBase to receive output data. diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_duplicate_output.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_duplicate_output.py new file mode 100644 index 0000000000000..aea7a1910b0b6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_duplicate_output.py @@ -0,0 +1,65 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest + +import paddle + +np.random.seed(1) + +if paddle.fluid.is_compiled_with_cuda(): + place = paddle.fluid.CUDAPlace(0) +else: + place = paddle.fluid.CPUPlace() + + +class SimpleNet(paddle.nn.Layer): + + def __init__(self): + super().__init__() + self._linear = paddle.nn.Linear(1, 1) + + def forward(self, x): + """ forward with duplicate outputs. + """ + x = self._linear(x) + return x, x + + +class TestDuplicateOutput(unittest.TestCase): + """ + TestCase for the transformation from control flow `if/else` + dependent on tensor in Dygraph into Static `fluid.layers.cond`. 
+ """ + + def setUp(self): + self.net = paddle.jit.to_static(SimpleNet()) + self.x = paddle.to_tensor([1.0]) + + def _run_static(self): + loss0, loss1 = self.net(self.x) + loss0.backward() + param = self.net.parameters() + self.assertEqual(param[0].grad.numpy(), 1.0) + + def test_ast_to_func(self): + self._run_static() + + +if __name__ == '__main__': + with paddle.fluid.framework._test_eager_guard(): + unittest.main() From 1f7f7193bbff55dbb5acd0e16e3c7b33f3e31653 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Fri, 15 Jul 2022 12:20:30 +0800 Subject: [PATCH 214/250] fix python3.10 compile bug on window (#44330) --- paddle/fluid/pybind/op_function_generator.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 7eeadac7cef2e..f659a671c3947 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -553,6 +553,11 @@ int main(int argc, char* argv[]) { std::ofstream out(path + "op_function" + std::to_string(i + 1) + ".cc.tmp", std::ios::out); + out << "#if defined(_MSC_VER)\n" + << "#include \n" + << "typedef SSIZE_T ssize_t;\n" + << "#endif\n"; + for (auto& header : headers) { out << "#include " + header + "\n"; } From f91308356a7ec8a85c7a9946437b021973588a9a Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Fri, 15 Jul 2022 13:38:13 +0800 Subject: [PATCH 215/250] Fix random seed for several unit tests (#44135) * Fix test_functional_conv2d_transpose random seed * Fix random seed and use np.testing * Fix random seed for test_lu_unpack_op * Fix test_autograd_functional_dynamic random seed --- .../test_autograd_functional_dynamic.py | 1 + .../test_functional_conv2d_transpose.py | 2 ++ .../tests/unittests/test_lu_unpack_op.py | 3 +++ .../fluid/tests/unittests/test_variable.py | 21 ++++++++++++------- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index 6c67b78d6a539..4b61580452592 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -676,4 +676,5 @@ def test_all_cases(self): if __name__ == "__main__": + np.random.seed(2022) unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py index d1b9c68925747..dce6a37c6bbb8 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py @@ -39,6 +39,7 @@ def setUp(self): self.groups = 1 self.no_bias = False self.data_format = "NHWC" + np.random.seed(2022) def prepare(self): if isinstance(self.filter_shape, int): @@ -188,6 +189,7 @@ def setUp(self): self.groups = 1 self.no_bias = False self.data_format = "NHWC" + np.random.seed(2022) def test_exception(self): self.prepare() diff --git a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py index 1757adef8e36f..97773c70e177a 100644 --- a/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py +++ b/python/paddle/fluid/tests/unittests/test_lu_unpack_op.py @@ -190,6 +190,9 @@ def config(self): class TestLU_UnpackAPI(unittest.TestCase): + 
def setUp(self): + np.random.seed(2022) + def test_dygraph(self): def run_lu_unpack_dygraph(shape, dtype): diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 87802b83415d6..5fb220da609a4 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -30,6 +30,9 @@ class TestVariable(unittest.TestCase): + def setUp(self): + np.random.seed(2022) + def test_np_dtype_convert(self): DT = core.VarDesc.VarType convert = convert_np_dtype_to_dtype_ @@ -486,6 +489,9 @@ def test_detach(self): class TestVariableSlice(unittest.TestCase): + def setUp(self): + np.random.seed(2022) + def _test_item_none(self, place): data = np.random.rand(2, 3, 4).astype("float32") prog = paddle.static.Program() @@ -545,6 +551,9 @@ def test_slice(self): class TestListIndex(unittest.TestCase): + def setUp(self): + np.random.seed(2022) + def numel(self, shape): return reduce(lambda x, y: x * y, shape) @@ -723,10 +732,10 @@ def run_getitem_list_index(self, array, index): return getitem_pp = exe.run(prog, feed={x.name: array}, fetch_list=fetch_list) - print(getitem_pp) - self.assertTrue(np.array_equal(value_np, getitem_pp[0]), - msg='\n numpy:{},\n paddle:{}'.format( - value_np, getitem_pp[0])) + np.testing.assert_allclose(value_np, + getitem_pp[0], + rtol=1e-5, + atol=1e-8) def test_static_graph_getitem_bool_index(self): paddle.enable_static() @@ -791,9 +800,7 @@ def run_setitem_list_index(self, array, index, value_np): }, fetch_list=fetch_list) - self.assertTrue(np.allclose(array2, setitem_pp[0]), - msg='\n numpy:{},\n paddle:{}'.format( - array2, setitem_pp[0])) + np.testing.assert_allclose(array2, setitem_pp[0], rtol=1e-5, atol=1e-8) def test_static_graph_setitem_list_index(self): paddle.enable_static() From d2e59e155aa86b426b3cb0feb5990be77f74fc37 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Fri, 15 Jul 2022 14:14:26 +0800 Subject: [PATCH 216/250] Remove boost library (#44092) --- cmake/external/boost.cmake | 64 ------------------- cmake/inference_lib.cmake | 6 -- cmake/third_party.cmake | 21 ++---- .../distributed/ps/service/CMakeLists.txt | 18 ++---- .../ps/service/communicator/CMakeLists.txt | 8 +-- .../fluid/distributed/ps/table/CMakeLists.txt | 3 +- .../ps/table/memory_sparse_table.cc | 5 +- paddle/fluid/distributed/test/CMakeLists.txt | 16 ++--- paddle/fluid/framework/CMakeLists.txt | 12 ++-- .../fluid/framework/details/build_strategy.h | 1 - paddle/fluid/framework/io/CMakeLists.txt | 2 +- .../framework/paddle2cinn/CMakeLists.txt | 2 +- paddle/fluid/inference/lite/CMakeLists.txt | 4 +- .../fluid/inference/tensorrt/CMakeLists.txt | 6 +- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/dlnne/CMakeLists.txt | 1 - .../operators/string/faster_tokenizer_op.cc | 1 - paddle/fluid/platform/CMakeLists.txt | 10 +-- .../fluid/platform/device/mlu/CMakeLists.txt | 2 +- .../fluid/platform/device/npu/CMakeLists.txt | 2 +- paddle/fluid/platform/stream/CMakeLists.txt | 2 +- paddle/infrt/CMakeLists.txt | 4 +- paddle/phi/kernels/autotune/CMakeLists.txt | 5 +- 23 files changed, 45 insertions(+), 152 deletions(-) delete mode 100644 cmake/external/boost.cmake diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake deleted file mode 100644 index 810796831e23e..0000000000000 --- a/cmake/external/boost.cmake +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -include(ExternalProject) - -set(BOOST_PROJECT "extern_boost") -# To release PaddlePaddle as a pip package, we have to follow the -# manylinux1 standard, which features as old Linux kernels and -# compilers as possible and recommends CentOS 5. Indeed, the earliest -# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new -# version of boost, say, 1.66.0, doesn't build on CentOS 6. We -# checked that the devtools package of CentOS 6 installs boost 1.41.0. -# So we use 1.41.0 here. -set(BOOST_VER "1.41.0") -# boost_1_41_0_2021_10.tar.gz is almost the same with boost_1_41_0.tar.gz, -# except in visualc.hpp i comment a warning of "unknown compiler version", -# so if you need to change boost, you may need to block the warning similarly. -set(BOOST_TAR - "boost_1_41_0_2021_10" - CACHE STRING "" FORCE) -set(BOOST_URL - "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" - CACHE STRING "" FORCE) - -message(STATUS "BOOST_VERSION: ${BOOST_VER}, BOOST_URL: ${BOOST_URL}") - -set(BOOST_PREFIX_DIR ${THIRD_PARTY_PATH}/boost) -set(BOOST_INCLUDE_DIR - "${THIRD_PARTY_PATH}/boost/src/extern_boost" - CACHE PATH "boost include directory." FORCE) -set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) -include_directories(${BOOST_INCLUDE_DIR}) - -if(WIN32 AND MSVC_VERSION GREATER_EQUAL 1600) - add_definitions(-DBOOST_HAS_STATIC_ASSERT) -endif() - -ExternalProject_Add( - ${BOOST_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${BOOST_URL} - URL_MD5 51be7cc203628dc0848e97eee32d79e3 - PREFIX ${BOOST_PREFIX_DIR} - DOWNLOAD_NO_PROGRESS 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - UPDATE_COMMAND "") - -add_library(boost INTERFACE) - -add_dependencies(boost ${BOOST_PROJECT}) -set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 865dd8643d867..2fc1be2545ddc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -471,12 +471,6 @@ copy( ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported) -set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/boost") -copy( - inference_lib_dist - SRCS ${BOOST_INCLUDE_DIR}/boost - DSTS ${dst_dir}) - set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/dlpack") copy( inference_lib_dist diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 3cefa0dfa26a2..dd8013d807b39 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -246,7 +246,6 @@ endif() include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog -include(external/boost) # download boost include(external/eigen) # download eigen3 include(external/threadpool) # download threadpool include(external/dlpack) # download dlpack @@ -254,14 +253,8 @@ include(external/xxhash) # download, build, install xxhash include(external/warpctc) # download, build, install warpctc include(external/utf8proc) # 
download, build, install utf8proc -list( - APPEND - third_party_deps - extern_eigen3 - extern_gflags - extern_glog - extern_boost - extern_xxhash) +list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog + extern_xxhash) list( APPEND third_party_deps @@ -272,14 +265,8 @@ list( extern_utf8proc) include(external/lapack) # download, build, install lapack -list( - APPEND - third_party_deps - extern_eigen3 - extern_gflags - extern_glog - extern_boost - extern_xxhash) +list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog + extern_xxhash) list( APPEND third_party_deps diff --git a/paddle/fluid/distributed/ps/service/CMakeLists.txt b/paddle/fluid/distributed/ps/service/CMakeLists.txt index ad49b651e2e71..709d11f7fbb84 100755 --- a/paddle/fluid/distributed/ps/service/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/CMakeLists.txt @@ -86,35 +86,29 @@ cc_library( cc_library( downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc - DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) + DEPS eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library( downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc ps_local_client.cc - DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) + DEPS eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library( client SRCS ps_client.cc - DEPS downpour_client boost ${RPC_DEPS}) + DEPS downpour_client ${RPC_DEPS}) cc_library( server SRCS server.cc - DEPS downpour_server boost ${RPC_DEPS}) + DEPS downpour_server ${RPC_DEPS}) cc_library( communicator SRCS communicator/communicator.cc - DEPS scope - client - boost - table - math_function - selected_rows_functor - ${RPC_DEPS}) + DEPS scope client table math_function selected_rows_functor ${RPC_DEPS}) cc_library( ps_service SRCS ps_service/service.cc - DEPS communicator client server boost ${RPC_DEPS}) + DEPS communicator client server ${RPC_DEPS}) cc_library( heter_client diff --git a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt index 612358c71a6fb..03244ecba7b4a 100644 --- a/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/service/communicator/CMakeLists.txt @@ -6,10 +6,4 @@ set_source_files_properties( cc_library( communicator SRCS communicator.cc - DEPS scope - client - boost - table - math_function - selected_rows_functor - ${RPC_DEPS}) + DEPS scope client table math_function selected_rows_functor ${RPC_DEPS}) diff --git a/paddle/fluid/distributed/ps/table/CMakeLists.txt b/paddle/fluid/distributed/ps/table/CMakeLists.txt index fdda59420f03c..3a9933cabdd7c 100644 --- a/paddle/fluid/distributed/ps/table/CMakeLists.txt +++ b/paddle/fluid/distributed/ps/table/CMakeLists.txt @@ -119,7 +119,6 @@ cc_library( string_helper device_context gflags - glog - boost) + glog) target_link_libraries(table -fopenmp) diff --git a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc index 60f012441c65c..115f8bcf58eaf 100644 --- a/paddle/fluid/distributed/ps/table/memory_sparse_table.cc +++ b/paddle/fluid/distributed/ps/table/memory_sparse_table.cc @@ -18,7 +18,6 @@ #include -#include "boost/lexical_cast.hpp" #include "glog/logging.h" #include "paddle/fluid/distributed/common/cost_timer.h" #include "paddle/fluid/framework/io/fs.h" @@ -530,7 +529,7 @@ int32_t MemorySparseTable::PullSparsePtr(char** pull_values, mf_value_size]() -> int { auto& keys = 
task_keys[shard_id]; auto& local_shard = _local_shards[shard_id]; - float data_buffer[value_size]; + float data_buffer[value_size]; // NOLINT float* data_buffer_ptr = data_buffer; for (size_t i = 0; i < keys.size(); ++i) { uint64_t key = keys[i].first; @@ -549,7 +548,7 @@ int32_t MemorySparseTable::PullSparsePtr(char** pull_values, ret = itr.value_ptr(); } int pull_data_idx = keys[i].second; - pull_values[pull_data_idx] = (char*)ret; + pull_values[pull_data_idx] = (char*)ret; // NOLINT } return 0; }); diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index 9b7a304b0a92a..16681ea77bbea 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -32,7 +32,6 @@ cc_test( client communicator ps_service - boost table ps_framework_proto ${COMMON_DEPS}) @@ -48,7 +47,6 @@ cc_test( client communicator ps_service - boost table ps_framework_proto ${COMMON_DEPS}) @@ -71,7 +69,6 @@ cc_test( client communicator ps_service - boost table ps_framework_proto ${COMMON_DEPS}) @@ -87,7 +84,6 @@ cc_test( client communicator ps_service - boost table ps_framework_proto ${COMMON_DEPS}) @@ -105,28 +101,28 @@ set_source_files_properties( cc_test( feature_value_test SRCS feature_value_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test( sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test( ctr_accessor_test SRCS ctr_accessor_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( ctr_dymf_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test( ctr_dymf_accessor_test SRCS ctr_dymf_accessor_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( memory_sparse_table_test.cc PROPERTIES COMPILE_FLAGS @@ -134,11 +130,11 @@ set_source_files_properties( cc_test( memory_sparse_table_test SRCS memory_sparse_table_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) set_source_files_properties( memory_geo_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test( memory_sparse_geo_table_test SRCS memory_geo_table_test.cc - DEPS ${COMMON_DEPS} boost table) + DEPS ${COMMON_DEPS} table) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 2aaa0c96e0a33..bd70e55ac45c4 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -45,7 +45,7 @@ proto_library(op_def_proto SRCS op_def.proto DEPS framework_proto) cc_library( op_def_api SRCS op_def_api.cc - DEPS op_def_proto boost) + DEPS op_def_proto) file(GLOB OP_DEF_FILES ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt) @@ -341,7 +341,7 @@ cc_library( cc_library( attribute SRCS attribute.cc - DEPS framework_proto boost enforce) + DEPS framework_proto enforce) cc_test( attribute_test SRCS attribute_test.cc @@ -354,12 +354,12 @@ cc_test( cc_library( op_version_proto SRCS op_version_proto.cc - DEPS framework_proto boost) + DEPS framework_proto) cc_library( op_version_registry SRCS op_version_registry.cc - DEPS op_version_proto framework_proto boost) + DEPS op_version_proto framework_proto) cc_test( op_version_registry_test SRCS 
op_version_registry_test.cc @@ -519,7 +519,7 @@ cc_test( cc_library( program_processing SRCS program_processing.cc - DEPS boost proto_desc) + DEPS proto_desc) cc_test( program_processing_test SRCS program_processing_test.cc @@ -1025,7 +1025,7 @@ endif() cc_library( prune SRCS prune.cc - DEPS framework_proto boost) + DEPS framework_proto) cc_test( prune_test SRCS prune_test.cc diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index baae0922ccd5d..1e27e381500aa 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -21,7 +21,6 @@ #include #include -#include "boost/optional.hpp" #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt index 4d21c6a892349..28644d530854f 100644 --- a/paddle/fluid/framework/io/CMakeLists.txt +++ b/paddle/fluid/framework/io/CMakeLists.txt @@ -5,7 +5,7 @@ cc_library( cc_library( fs SRCS fs.cc - DEPS string_helper glog boost enforce shell) + DEPS string_helper glog enforce shell) cc_test( test_fs diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index 7cb9cf254fb1a..5b8e62d4f079d 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -1,7 +1,7 @@ cc_library( cinn_cache_key SRCS cinn_cache_key.cc - DEPS boost graph graph_helper lod_tensor proto_desc) + DEPS graph graph_helper lod_tensor proto_desc) cc_library( build_cinn_pass SRCS build_cinn_pass.cc diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt index 7aa010cb0066c..3f4992b8946ec 100644 --- a/paddle/fluid/inference/lite/CMakeLists.txt +++ b/paddle/fluid/inference/lite/CMakeLists.txt @@ -5,7 +5,7 @@ endif() cc_library( lite_op_teller SRCS op_teller.cc - DEPS ${LITE_DEPS} framework_proto device_context boost xxhash) + DEPS ${LITE_DEPS} framework_proto device_context xxhash) cc_library( lite_engine SRCS engine.cc @@ -13,7 +13,7 @@ cc_library( cc_library( lite_tensor_utils SRCS tensor_utils.cc - DEPS memcpy ${LITE_DEPS} framework_proto boost device_context ${XPU_DEPS}) + DEPS memcpy ${LITE_DEPS} framework_proto device_context ${XPU_DEPS}) cc_test( test_lite_engine SRCS test_engine_lite.cc diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index cd03dce1795e2..7239b506d33f6 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -4,18 +4,18 @@ if(WIN32) nv_library( tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc - DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost + DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context paddle_inference_api) else() nv_library( tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc - DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost) + DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) endif() nv_library( tensorrt_op_teller SRCS op_teller.cc - DEPS framework_proto device_context boost) + DEPS framework_proto device_context) nv_test( test_tensorrt SRCS test_tensorrt.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 809ad5174b60b..893f7d51140a7 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ 
b/paddle/fluid/operators/CMakeLists.txt @@ -172,7 +172,7 @@ sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc_functor matrix_inverse matrix_solve) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function) if (WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/operators/dlnne/CMakeLists.txt b/paddle/fluid/operators/dlnne/CMakeLists.txt index 11347f0f94e5c..a2aa80f2875b8 100644 --- a/paddle/fluid/operators/dlnne/CMakeLists.txt +++ b/paddle/fluid/operators/dlnne/CMakeLists.txt @@ -39,7 +39,6 @@ op_library( DEPS ${GLOB_OPERATOR_DEPS} framework_proto - boost device_context op_registry scope) diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc index 4c74b20bf9378..3539e2213a39d 100644 --- a/paddle/fluid/operators/string/faster_tokenizer_op.cc +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include #include -#include #include #include #include diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 2374cfdfd3426..e872fb162530f 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -37,7 +37,7 @@ endif() cc_library( flags SRCS flags.cc - DEPS gflags boost) + DEPS gflags) cc_library( denormal SRCS denormal.cc @@ -48,7 +48,7 @@ cc_test( SRCS errors_test.cc DEPS errors enforce) -set(enforce_deps flags errors boost flags phi_enforce) +set(enforce_deps flags errors flags phi_enforce) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() @@ -99,7 +99,7 @@ endif() cc_library( place SRCS place.cc - DEPS enforce boost phi_place) + DEPS enforce phi_place) cc_test( place_test SRCS place_test.cc @@ -185,7 +185,7 @@ endif() cc_library( cudnn_workspace_helper SRCS cudnn_workspace_helper.cc - DEPS boost) + DEPS) # separate init from device_context to avoid cycle dependencies cc_library( @@ -372,7 +372,7 @@ add_subdirectory(profiler) cc_library( device_tracer SRCS device_tracer.cc - DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) + DEPS profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) nv_library( profiler diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index 08b33c9b58f06..43a8f17504750 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -13,7 +13,7 @@ cc_library( cc_library( mlu_stream SRCS mlu_stream.cc - DEPS boost mlu_info stream_callback_manager eigen3 ${MKLDNN_CTX_DEPS}) + DEPS mlu_info stream_callback_manager eigen3 ${MKLDNN_CTX_DEPS}) cc_library( mlu_device_context SRCS device_context.cc diff --git a/paddle/fluid/platform/device/npu/CMakeLists.txt b/paddle/fluid/platform/device/npu/CMakeLists.txt index 9015a76e9cd5a..417b0f9ab6e1a 100644 --- a/paddle/fluid/platform/device/npu/CMakeLists.txt +++ b/paddle/fluid/platform/device/npu/CMakeLists.txt @@ -21,7 
+21,7 @@ if(WITH_ASCEND_CL) cc_library( npu_stream SRCS npu_stream.cc - DEPS enforce boost stream_callback_manager) + DEPS enforce stream_callback_manager) cc_library( npu_collective_helper SRCS npu_collective_helper.cc diff --git a/paddle/fluid/platform/stream/CMakeLists.txt b/paddle/fluid/platform/stream/CMakeLists.txt index 25d2874ca04d2..32c1857bf6903 100644 --- a/paddle/fluid/platform/stream/CMakeLists.txt +++ b/paddle/fluid/platform/stream/CMakeLists.txt @@ -2,5 +2,5 @@ if(WITH_GPU OR WITH_ROCM) cc_library( cuda_stream SRCS cuda_stream.cc - DEPS enforce boost eigen3 ${MKLDNN_CTX_DEPS}) + DEPS enforce eigen3 ${MKLDNN_CTX_DEPS}) endif() diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 3846acbde4819..a19fb2d0a8ed9 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -128,11 +128,11 @@ endif() cc_library( infrt SHARED SRCS ${infrt_src} - DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) + DEPS glog ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) cc_library( infrt_static SRCS ${infrt_src} - DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) + DEPS glog ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt index a7a6c2f8e4dc0..9379c78c8d005 100644 --- a/paddle/phi/kernels/autotune/CMakeLists.txt +++ b/paddle/phi/kernels/autotune/CMakeLists.txt @@ -18,10 +18,7 @@ elseif(WITH_ROCM) DEPS gtest) endif() -cc_library( - cache - SRCS cache.cc - DEPS boost) +cc_library(cache SRCS cache.cc) cc_library( switch_autotune SRCS switch_autotune.cc From d881d690e970b8be8d10f659db535c02a168793a Mon Sep 17 00:00:00 2001 From: RichardWooSJTU <37864677+RichardWooSJTU@users.noreply.github.com> Date: Fri, 15 Jul 2022 14:38:20 +0800 Subject: [PATCH 217/250] add fused token prune op and plugin (#44281) * add fused token prune op and plugin --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 3 +- .../tensorrt/convert/fused_token_prune_op.cc | 76 +++ paddle/fluid/inference/tensorrt/op_teller.cc | 3 +- .../inference/tensorrt/plugin/CMakeLists.txt | 10 +- .../plugin/fused_token_prune_op_plugin.cu | 527 ++++++++++++++++++ .../plugin/fused_token_prune_op_plugin.h | 159 ++++++ .../plugin/test_fused_token_prune_plugin.cc | 48 ++ .../inference/tensorrt/test_dynamic_engine.cc | 192 +++++++ .../fluid/operators/fused_token_prune_op.cc | 187 +++++++ .../fluid/operators/fused_token_prune_op.cu | 287 ++++++++++ .../fluid/operators/fused_token_prune_op.cu.h | 50 ++ .../unittests/ir/inference/CMakeLists.txt | 7 + .../test_trt_convert_fused_token_prune.py | 129 +++++ .../unittests/test_fused_token_prune_op.py | 112 ++++ tools/static_mode_white_list.py | 1 + 16 files changed, 1789 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc create mode 100644 paddle/fluid/operators/fused_token_prune_op.cc create mode 100644 paddle/fluid/operators/fused_token_prune_op.cu create mode 100644 paddle/fluid/operators/fused_token_prune_op.cu.h create mode 
100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fused_token_prune.py create mode 100644 python/paddle/fluid/tests/unittests/test_fused_token_prune_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d008355e0ed5b..5e787394bce25 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2089,6 +2089,7 @@ USE_TRT_CONVERTER(top_k) USE_TRT_CONVERTER(top_k_v2) USE_TRT_CONVERTER(squeeze2) USE_TRT_CONVERTER(unsqueeze2) +USE_TRT_CONVERTER(fused_token_prune) #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) USE_TRT_CONVERTER(sparse_fc) USE_TRT_CONVERTER(sparse_multihead_matmul) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 90089fcbfd806..ca91df902a9a1 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -68,7 +68,8 @@ list( c_allreduce_op.cc top_k_op.cc squeeze2_op.cc - unsqueeze2_op.cc) + unsqueeze2_op.cc + fused_token_prune_op.cc) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND CONVERT_FILES sparse_fc_op.cc sparse_multihead_matmul_op.cc) diff --git a/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc new file mode 100644 index 0000000000000..bab04ac16aac9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/fused_token_prune_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class FusedTokenPruneOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; + + auto* Attn = engine_->GetITensor(op_desc.Input("Attn").front()); + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* Mask = engine_->GetITensor(op_desc.Input("Mask").front()); + auto* NewMask = engine_->GetITensor(op_desc.Input("NewMask").front()); + bool keep_first_token = + op_desc.HasAttr("keep_first_token") + ? BOOST_GET_CONST(bool, op_desc.GetAttr("keep_first_token")) + : true; + bool keep_order = op_desc.HasAttr("keep_order") + ? 
BOOST_GET_CONST(bool, op_desc.GetAttr("keep_order")) + : false; + + std::vector itensors = {Attn, X, Mask, NewMask}; + + auto output_name = op_desc.Output("SlimmedX")[0]; + auto out_inds_name = op_desc.Output("CLSInds")[0]; + if (engine_->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(6000) + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + + if (engine_->precision() == AnalysisConfig::Precision::kInt8) { + with_fp16 = true; + } + plugin::FusedTokenPrunePluginDynamic* plugin = + new plugin::FusedTokenPrunePluginDynamic( + with_fp16, keep_first_token, keep_order); + layer = engine_->AddDynamicPlugin(itensors.data(), 4, plugin); +#else + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); +#endif + } else { + PADDLE_THROW(platform::errors::Fatal( + "You are running the Ernie(Bert) model in static shape mode, which " + "is not supported for the time being.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) interface to set " + "the shape information to run the dynamic shape mode.")); + } + RreplenishLayerAndOutput( + layer, "fused_token_prune", {output_name, out_inds_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(fused_token_prune, FusedTokenPruneOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 89019835a65fd..eaef331356575 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -275,7 +275,8 @@ struct SimpleOpTypeSetTeller : public Teller { "recover_padding", "remove_padding", "squeeze2", - "unsqueeze2"}; + "unsqueeze2", + "fused_token_prune"}; }; bool OpTeller::Tell(const framework::ir::Node* node, diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index cd65316fb4a63..90344fc0adae8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -29,7 +29,8 @@ list( remove_padding_plugin.cu recover_padding_plugin.cu c_allreduce_op_plugin.cu - preln_residual_bias_plugin.cu) + preln_residual_bias_plugin.cu + fused_token_prune_op_plugin.cu) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND TRT_FILES spmm_plugin.cu) @@ -44,3 +45,10 @@ nv_test( test_split_plugin SRCS test_split_plugin.cc DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin) + +if(NOT WIN32) + nv_test( + test_fused_token_prune_plugin + SRCS test_fused_token_prune_plugin.cc + DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin) +endif() diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu new file mode 100644 index 0000000000000..627ef44e6fd75 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -0,0 +1,527 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "cub/cub.cuh" + +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" +#include "paddle/fluid/operators/fused_token_prune_op.cu.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) + +template +__global__ void ElementwiseMask(const T* a, + const T* b, + T* res, + int num_elements) { + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_elements) return; + const T zero = 0; + res[tid] = b[tid] >= zero ? a[tid] : zero; +} + +template +__global__ void FillZero(T* data, int len) { + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= len) return; + const T zero = 0; + data[tid] = zero; +} + +__global__ void FillIndex(int32_t* indices, int num_raws, int num_cols) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_raws * num_cols) return; + + int col = tid % num_cols; + int raw = tid / num_cols; + + indices[tid] = col; +} + +template +__global__ void MaximumFirst(T* mat, int num_raws, int num_cols, T max_value) { + auto raw = blockIdx.x * blockDim.x + threadIdx.x; + if (raw >= num_raws) return; + mat[raw * num_cols] = max_value; +} + +__global__ void FillOffsets(int* offsets, int num_raws, int num_cols) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid > num_raws) return; + + offsets[tid] = tid * num_cols; +} + +template +__global__ void Slice( + const T* src, T* dst, int num_raws, int src_num_cols, int dst_num_cols) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_raws * dst_num_cols) return; + int raw = tid / dst_num_cols; + int col = tid % dst_num_cols; + dst[tid] = src[raw * src_num_cols + col]; +} + +template +__global__ void ReduceSum2( + const T* src, T* dst, int bsz, int nb_head, int max_seq_len) { + int tid = threadIdx.x; + int bid = blockIdx.x; + int num_blocks_per_head = ((max_seq_len / blockDim.x) * max_seq_len); + int batch = bid / (nb_head * num_blocks_per_head); + int col = bid % max_seq_len; + int head = (bid / num_blocks_per_head) % nb_head; + + extern __shared__ T res_float[]; + res_float[tid] = + src[batch * (nb_head * max_seq_len * max_seq_len) + + head * (max_seq_len * max_seq_len) + col + tid * max_seq_len]; + __syncthreads(); + + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { + if (tid < offset) { + res_float[tid] += res_float[tid + offset]; + } + __syncthreads(); + if (offset % 2 == 1 && tid == offset - 2) { + res_float[tid] += res_float[tid + 1]; + } + } + + if (tid == 0) { + auto* dst_addr = dst + batch * max_seq_len + col; + atomicAdd(dst_addr, res_float[0]); + } +} + +template <> +__global__ void ReduceSum2( + const half* src, half* dst, int bsz, int nb_head, int max_seq_len) { + int tid = threadIdx.x; + int bid = blockIdx.x; + int num_blocks_per_head = ((max_seq_len / blockDim.x) * 
max_seq_len); + int batch = bid / (nb_head * num_blocks_per_head); + int col = bid % max_seq_len; + int head = (bid / num_blocks_per_head) % nb_head; + + extern __shared__ half res_half[]; + res_half[tid] = + src[batch * (nb_head * max_seq_len * max_seq_len) + + head * (max_seq_len * max_seq_len) + col + tid * max_seq_len]; + __syncthreads(); + + for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) { + if (tid < offset) { + res_half[tid] += res_half[tid + offset]; + } + __syncthreads(); + if (offset % 2 == 1 && tid == offset - 2) { + res_half[tid] += res_half[tid + 1]; + } + __syncthreads(); + } + + if (tid == 0) { + platform::fastAtomicAdd( + reinterpret_cast(dst), + static_cast(batch * max_seq_len + col), + static_cast(bsz * max_seq_len), + static_cast(res_half[0])); + } +} + +template +__global__ void TakeAlongAxis(const T* src, + T* dst, + int32_t* indices, + int num_raws, + int src_num_cols, + int dst_num_cols, + int num_elements) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid >= num_raws * dst_num_cols) return; + + int raw = tid / dst_num_cols; + int col = tid % dst_num_cols; + for (int i = 0; i < num_elements; ++i) { + dst[tid * num_elements + i] = + *(src + (raw * src_num_cols + indices[tid]) * num_elements + i); + } +} + +nvinfer1::DimsExprs FusedTokenPrunePluginDynamic::getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { + auto x_dims = inputs[1], new_mask_dims = inputs[3]; + if (output_index == 0) { + nvinfer1::DimsExprs ret = x_dims; + ret.d[1] = new_mask_dims.d[2]; + return ret; + } else { + nvinfer1::DimsExprs ret; + ret.nbDims = 2; + ret.d[0] = new_mask_dims.d[0]; + ret.d[1] = new_mask_dims.d[2]; + return ret; + } +} + +bool FusedTokenPrunePluginDynamic::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_NOT_NULL( + in_out, + platform::errors::InvalidArgument( + "The input of swish plugin shoule not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, + nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, + nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { + if (with_fp16_) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); +#else + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); +#endif + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } else if (pos <= 4) { + const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; + return in.type == prev.type && in.format == prev.format; + } else { + const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; + return in.type == nvinfer1::DataType::kINT32 && in.format == prev.format; + } +} + +nvinfer1::DataType FusedTokenPrunePluginDynamic::getOutputDataType( + int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + if (index == 0) { + return input_types[1]; + } else if (index == 1) { + return nvinfer1::DataType::kINT32; + } +} + +size_t FusedTokenPrunePluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, + int nb_inputs, + const nvinfer1::PluginTensorDesc* outputs, + int nb_outputs) 
const TRT_NOEXCEPT { + auto attn_dims = inputs[0].dims; + auto x_dims = inputs[1].dims; + auto new_mask_dims = inputs[3].dims; + auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1], + max_seq_len = attn_dims.d[2]; + + int slimmed_x_len = new_mask_dims.d[2]; + int total = bsz * nb_head * max_seq_len * max_seq_len; + size_t size = total * sizeof(float); + size += bsz * max_seq_len * sizeof(float); + size += bsz * max_seq_len * sizeof(int32_t); + size += bsz * max_seq_len * sizeof(float); + size += bsz * max_seq_len * sizeof(int32_t); + size += (bsz + 1) * sizeof(int); + size += bsz * slimmed_x_len * sizeof(int32_t); + return size; +} + +template +int FusedTokenPrunePluginDynamic::enqueueImpl( + const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace_ptr, + cudaStream_t stream, + int device_id, + T max_value) { + // Dims + auto attn_dims = input_desc[0].dims; + auto x_dims = input_desc[1].dims; + auto new_mask_dims = input_desc[3].dims; + + auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1], + max_seq_len = attn_dims.d[2]; + auto c = x_dims.d[2]; + auto slimmed_x_len = new_mask_dims.d[2]; + + // Inputs + const T* attn_data = static_cast(inputs[0]); + const T* x_data = static_cast(inputs[1]); + const T* mask_data = static_cast(inputs[2]); + + // Outputs + T* output_data = static_cast(outputs[0]); + int32_t* output_indices_data = static_cast(outputs[1]); + + int total = bsz * nb_head * max_seq_len * max_seq_len; + int block = operators::ComputeBlockSize(max_seq_len); + int grid = operators::CeilDivide(total, block); + + // Workspace for intermediate variable + char* workspace = static_cast(workspace_ptr); + T* attn_tmp_data = reinterpret_cast(workspace); + size_t offset = total * sizeof(T); + T* attn_accu_data = reinterpret_cast(workspace + offset); + offset += bsz * max_seq_len * sizeof(T); + int32_t* attn_accu_indices_data = + reinterpret_cast(workspace + offset); + offset += bsz * max_seq_len * sizeof(int32_t); + T* sort_attn_accu_data = reinterpret_cast(workspace + offset); + offset += bsz * max_seq_len * sizeof(T); + int32_t* sort_attn_accu_indices_data = + reinterpret_cast(workspace + offset); + offset += bsz * max_seq_len * sizeof(int32_t); + int* offsets_data = reinterpret_cast(workspace + offset); + offset += (bsz + 1) * sizeof(int); + int32_t* slimmed_sort_attn_accu_indices_data = + reinterpret_cast(workspace + offset); + + // 1. Filter attn by mask + ElementwiseMask + <<>>(attn_data, mask_data, attn_tmp_data, total); + + total = bsz * max_seq_len; + block = operators::ComputeBlockSize(max_seq_len); + grid = operators::CeilDivide(total, block); + FillZero<<>>(attn_accu_data, total); + + // 2. Reduce sum + total = bsz * nb_head * max_seq_len * max_seq_len; + int block_tmp = max_seq_len; + while (block_tmp > 1024) + block_tmp /= 2; // if max seq len > 1024, it must be 2^n + block = + block_tmp; // make sure max_seq_len is an integral multiple of block_size + grid = operators::CeilDivide(total, block); + ReduceSum2<<>>( + attn_tmp_data, attn_accu_data, bsz, nb_head, max_seq_len); + + // 3. Prepare token indices + total = bsz * max_seq_len; + block = operators::ComputeBlockSize(max_seq_len); + grid = operators::CeilDivide(total, block); + + FillIndex<<>>( + attn_accu_indices_data, bsz, max_seq_len); + + // 4. 
Sort token indices by attn + if (keep_first_token_) { + MaximumFirst + <<>>(attn_accu_data, bsz, max_seq_len, max_value); + } + size_t temp_storage_bytes = -1; + int num_items = bsz * max_seq_len; + int num_segments = bsz; + FillOffsets<<>>(offsets_data, bsz, max_seq_len); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortPairsDescending( + nullptr, + temp_storage_bytes, + attn_accu_data, + sort_attn_accu_data, + attn_accu_indices_data, + sort_attn_accu_indices_data, + num_items, + num_segments, + offsets_data, + offsets_data + 1, + 0, + sizeof(T) * 8, + stream)); + int64_t temp_size = temp_storage_bytes; + framework::Tensor temp_storage; + auto* temp_storage_data = temp_storage.mutable_data( + {temp_size}, platform::CUDAPlace(device_id)); + + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortPairsDescending( + temp_storage_data, + temp_storage_bytes, + attn_accu_data, + sort_attn_accu_data, + attn_accu_indices_data, + sort_attn_accu_indices_data, + num_items, + num_segments, + offsets_data, + offsets_data + 1, + 0, + sizeof(T) * 8, + stream)); + // 5. Slice + total = bsz * slimmed_x_len; + block = operators::ComputeBlockSize(slimmed_x_len); + grid = operators::CeilDivide(total, block); + + Slice + <<>>(sort_attn_accu_indices_data, + slimmed_sort_attn_accu_indices_data, + bsz, + max_seq_len, + slimmed_x_len); + + if (keep_order_) { + // 6. reorder + num_items = bsz * slimmed_x_len; + FillOffsets<<>>(offsets_data, bsz, slimmed_x_len); + temp_storage_bytes = -1; + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortKeys( + nullptr, + temp_storage_bytes, + slimmed_sort_attn_accu_indices_data, + output_indices_data, + num_items, + num_segments, + offsets_data, + offsets_data + 1, + 0, + sizeof(int32_t) * 8, + stream)); + + temp_size = temp_storage_bytes; + temp_storage.Resize({temp_size}); + temp_storage_data = + temp_storage.mutable_data(platform::CUDAPlace(device_id)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortKeys( + temp_storage_data, + temp_storage_bytes, + slimmed_sort_attn_accu_indices_data, + output_indices_data, + num_items, + num_segments, + offsets_data, + offsets_data + 1, + 0, + sizeof(int32_t) * 8, + stream)); + + TakeAlongAxis<<>>(x_data, + output_data, + output_indices_data, + bsz, + max_seq_len, + slimmed_x_len, + c); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(output_indices_data, + slimmed_sort_attn_accu_indices_data, + bsz * slimmed_x_len * sizeof(int32_t), + cudaMemcpyDeviceToDevice)); + TakeAlongAxis + <<>>(x_data, + output_data, + slimmed_sort_attn_accu_indices_data, + bsz, + max_seq_len, + slimmed_x_len, + c); + } + + return cudaGetLastError() != cudaSuccess; +} + +int FusedTokenPrunePluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + auto input_type = input_desc[0].type; + auto attn_dims = input_desc[0].dims; + auto bsz = attn_dims.d[0], nb_head = attn_dims.d[1], + max_seq_len = attn_dims.d[2]; + int device_id; + cudaGetDevice(&device_id); + + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. 
FusedTokenPrune-->fp32"; + + float max = std::numeric_limits::max(); + + return enqueueImpl(input_desc, + output_desc, + inputs, + outputs, + workspace, + stream, + device_id, + max); + + } else if (input_type == nvinfer1::DataType::kHALF) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE + VLOG(1) << "TRT Plugin DataType selected. FusedTokenPrune-->fp16"; + + half max = 65504.0; + + return enqueueImpl(input_desc, + output_desc, + inputs, + outputs, + workspace, + stream, + device_id, + max); + +#else + PADDLE_THROW(platform::errors::Fatal( + "The Ernie(Bert) TensorRT Plugin should be " + "complied with CUDA version >= 10.0 when running with fp16. " + "Please recomplie it or try to use fp32 by set " + "config.SetTRTDynamicShapeInfo(min_input_shape, " + "max_input_shape, opt_input_shape, true")); +#endif + } else { + PADDLE_THROW( + platform::errors::Fatal("The FusedTokenPrune TRT Plugin's input type " + "should be float or half.")); + } +} + +#endif +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h new file mode 100644 index 0000000000000..fcd91522ca39c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h @@ -0,0 +1,159 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) + +class FusedTokenPrunePluginDynamic : public DynamicPluginTensorRT { + public: + explicit FusedTokenPrunePluginDynamic(bool with_fp16, + bool keep_first_token, + bool keep_order) + : keep_first_token_(keep_first_token), keep_order_(keep_order) { + with_fp16_ = with_fp16; + } + FusedTokenPrunePluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &with_fp16_); + DeserializeValue(&serial_data, &serial_length, &keep_first_token_); + DeserializeValue(&serial_data, &serial_length, &keep_order_); + } + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + return new FusedTokenPrunePluginDynamic( + with_fp16_, keep_first_token_, keep_order_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "fused_token_prune_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 2; } + int initialize() TRT_NOEXCEPT override { return 0; } + + size_t getSerializationSize() const TRT_NOEXCEPT override { + return SerializedSize(with_fp16_) + SerializedSize(keep_first_token_) + + SerializedSize(keep_order_); + } + void serialize(void* buffer) const TRT_NOEXCEPT override { + SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, keep_first_token_); + SerializeValue(&buffer, keep_order_); + } + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT + TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nb_inputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nb_outputs) TRT_NOEXCEPT override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nb_inputs, + const nvinfer1::PluginTensorDesc* outputs, + int nb_outputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const + TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + template + int enqueueImpl(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream, + int device_id, + T max_value); + bool keep_first_token_; + bool keep_order_; +}; + +class FusedTokenPrunePluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + FusedTokenPrunePluginDynamicCreator() {} + const char* getPluginName() const TRT_NOEXCEPT override { + return "fused_token_prune_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + 
TRT_NOEXCEPT override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) + TRT_NOEXCEPT override { + auto plugin = new FusedTokenPrunePluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const TRT_NOEXCEPT override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(FusedTokenPrunePluginDynamicCreator); + +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc new file mode 100644 index 0000000000000..131ce46d89a66 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/test_fused_token_prune_plugin.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +TEST(fused_token_prune_op_plugin, test_plugin) { + FusedTokenPrunePluginDynamic plugin( + true, /*keep_first_token*/ false, /*keep_order*/ true); + plugin.configurePlugin(nullptr, 4, nullptr, 2); + plugin.initialize(); + plugin.getPluginType(); + plugin.getNbOutputs(); + auto clone_plugin = plugin.clone(); + clone_plugin->destroy(); + size_t buf_size = plugin.getSerializationSize(); + std::vector buf(buf_size); + plugin.serialize(buf.data()); +} + +TEST(fused_token_prune_op_plugin, test_plugin_creater) { + FusedTokenPrunePluginDynamicCreator creator; + creator.getFieldNames(); + creator.createPlugin("test", nullptr); + creator.setPluginNamespace("test"); +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index eae1e2baf9ad1..8d95bbea5b89f 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -22,6 +22,7 @@ limitations under the License. 
*/ #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) #include "paddle/fluid/inference/tensorrt/plugin/spmm_plugin.h" #endif +#include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/float16.h" @@ -195,6 +196,197 @@ TEST_F(TensorRTDynamicEngineTest, test_spmm) { return; } +class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { + protected: + void SetUp() override { + ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CUDAPlace(0)) + .get()); + ctx_->SetPinnedAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPinnedPlace()) + .get()); + ctx_->PartialInitWithAllocator(); + + std::map> min_input_shape = { + {"attn", {4, 1, 4, 4}}, + {"x", {4, 4, 1}}, + {"mask", {4, 1, 4, 4}}, + {"new_mask", {4, 1, 2, 2}}}; + std::map> max_input_shape = { + {"attn", {4, 1, 4, 4}}, + {"x", {4, 4, 1}}, + {"mask", {4, 1, 4, 4}}, + {"new_mask", {4, 1, 2, 2}}}; + std::map> optim_input_shape = { + {"attn", {4, 1, 4, 4}}, + {"x", {4, 4, 1}}, + {"mask", {4, 1, 4, 4}}, + {"new_mask", {4, 1, 2, 2}}}; + + engine_ = new TensorRTEngine(16, + 1 << 10, + AnalysisConfig::Precision::kHalf, + nullptr, + 0, + min_input_shape, + max_input_shape, + optim_input_shape, + false, + phi::DataType::FLOAT32, + NaiveLogger::Global()); + engine_->InitNetwork(); + } + + void TearDown() override { + if (engine_) { + delete engine_; + engine_ = nullptr; + } + } + + void PrepareInputOutput(const std::vector> inputs, + std::vector> output_shapes) { + LOG(INFO) << "PrepareInputOutput"; + int num_inputs = inputs.size(); + int num_outputs = output_shapes.size(); + inputs_.resize(num_inputs); + outputs_.resize(num_outputs); + for (int i = 0; i < num_inputs; ++i) { + paddle::framework::TensorFromVector(inputs[i], *ctx_, &inputs_[i]); + } + for (int i = 0; i < num_outputs; ++i) { + outputs_[i].Resize(phi::make_ddim(output_shapes[i])); + } + } + + void GetOutput(std::vector &slimmed_x, // NOLINT + std::vector &cls_inds) { // NOLINT + paddle::framework::TensorToVector(outputs_[0], *ctx_, &slimmed_x); + paddle::framework::TensorToVector(outputs_[1], *ctx_, &cls_inds); + } + + protected: + std::vector inputs_; + std::vector outputs_; + TensorRTEngine *engine_; + platform::CUDADeviceContext *ctx_; +}; + +TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { +#if IS_TRT_VERSION_GE(8000) + auto *attn = engine_->DeclareInput( + "attn", nvinfer1::DataType::kHALF, nvinfer1::Dims4{-1, 1, 4, 4}); + auto *x = engine_->DeclareInput( + "x", nvinfer1::DataType::kHALF, nvinfer1::Dims3{-1, 4, 1}); + auto *mask = engine_->DeclareInput( + "mask", nvinfer1::DataType::kHALF, nvinfer1::Dims4{-1, 1, 4, 4}); + auto *new_mask = engine_->DeclareInput( + "new_mask", nvinfer1::DataType::kHALF, nvinfer1::Dims4{-1, 1, 2, 2}); + plugin::FusedTokenPrunePluginDynamic *plugin = + new plugin::FusedTokenPrunePluginDynamic( + true, /*keep_first_token*/ false, /*keep_order*/ true); + std::vector itensors = {attn, x, mask, new_mask}; + auto *layer = engine_->AddDynamicPlugin(itensors.data(), 4, plugin); + 
PADDLE_ENFORCE_NOT_NULL(layer, + platform::errors::InvalidArgument( + "TRT fused_token_prune layer building failed.")); + std::vector output_tensor_names{"out_slimmed_x", "out_cls_inds"}; + for (size_t i = 0; i < 2; i++) { + layer->getOutput(i)->setName(output_tensor_names[i].c_str()); + engine_->DeclareOutput(layer, i, output_tensor_names[i]); + } + engine_->FreezeNetwork(); + + ASSERT_EQ(engine_->engine()->getNbBindings(), 6); + LOG(INFO) << "create input"; + std::vector attn_v(64); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + for (int k = 0; k < 4; ++k) { + attn_v[i * 16 + j * 4 + k] = k; + } + } + } + std::vector x_v(16); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + x_v[i * 4 + j] = 1; + } + } + std::vector mask_v(64); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + for (int k = 0; k < 4; ++k) { + mask_v[i * 16 + j * 4 + k] = 1; + } + } + } + std::vector new_mask_v(16); + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 2; ++k) { + new_mask_v[i * 4 + j * 2 + k] = 1; + } + } + } + + LOG(INFO) << "create output"; + std::vector out_slimmed_x_shape{4, 2, 1}; + std::vector out_cls_ins_shape{4, 2}; + + PrepareInputOutput({attn_v, x_v, mask_v, new_mask_v}, + {out_slimmed_x_shape, out_cls_ins_shape}); + + auto *attn_gpu_data = inputs_[0].mutable_data(ctx_->GetPlace()); + auto *x_gpu_data = inputs_[1].mutable_data(ctx_->GetPlace()); + auto *mask_gpu_data = inputs_[2].mutable_data(ctx_->GetPlace()); + auto *new_mask_gpu_data = inputs_[3].mutable_data(ctx_->GetPlace()); + + auto *slimmed_x_gpu_data = outputs_[0].mutable_data(ctx_->GetPlace()); + auto *cls_inds_gpu_data = outputs_[1].mutable_data(ctx_->GetPlace()); + + LOG(INFO) << "create buffers"; + + std::vector buffers(6); + buffers[0] = reinterpret_cast(attn_gpu_data); + buffers[1] = reinterpret_cast(x_gpu_data); + buffers[2] = reinterpret_cast(mask_gpu_data); + buffers[3] = reinterpret_cast(new_mask_gpu_data); + buffers[4] = reinterpret_cast(slimmed_x_gpu_data); + buffers[5] = reinterpret_cast(cls_inds_gpu_data); + + LOG(INFO) << "Execute"; + + engine_->Execute(4, &buffers, ctx_->stream()); + + std::vector slimmed_x_v; + std::vector cls_inds_v; + + LOG(INFO) << "GetOutput"; + GetOutput(slimmed_x_v, cls_inds_v); + + ASSERT_EQ(cls_inds_v[0], 2); + ASSERT_EQ(cls_inds_v[1], 3); + ASSERT_EQ(cls_inds_v[2], 2); + ASSERT_EQ(cls_inds_v[3], 3); + ASSERT_EQ(cls_inds_v[4], 2); + ASSERT_EQ(cls_inds_v[5], 3); + ASSERT_EQ(cls_inds_v[6], 2); + ASSERT_EQ(cls_inds_v[7], 3); + LOG(INFO) << "finish"; +#endif +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/operators/fused_token_prune_op.cc b/paddle/fluid/operators/fused_token_prune_op.cc new file mode 100644 index 0000000000000..50ca45967b7bd --- /dev/null +++ b/paddle/fluid/operators/fused_token_prune_op.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
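A quick worked check of the assertions in the dynamic-shape test above: every attention row is filled with [0, 1, 2, 3] and the mask is all ones, so the masking step drops nothing; summing over the single head and the four rows gives per-token scores of [0, 4, 8, 12] in each batch. Sorting by score ranks the tokens 3, 2, 1, 0. With keep_first_token set to false, the new_mask width of 2 keeps tokens {3, 2}, and keep_order set to true re-sorts those indices ascending, so the index output comes out as [2, 3] for every batch and the slimmed x keeps rows 2 and 3 of x, which is exactly what the ASSERT_EQ lines verify.
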
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class FusedTokenPruneOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Attn", + "(Tensor)" + "The input of fused_token_prune op, whose shape should be [bsz, " + "num_head, max_seq_len, max_seq_len] and dtype should be " + "float32/float64," + "Attn is attention scores of input sequences which will be used " + "to sort another input tensor: X's indices so that " + "some elements of X with lower attention score will not be " + "considered after this op."); + + AddInput("X", + "(Tensor)" + "The input of fused_token_prune op, whose shape should be [bsz, " + "max_seq_len, c] and dtype should be float32/float64."); + + AddInput( + "Mask", + "(Tensor)" + "The input of fused_token_prune op, whose shape should be [bsz, " + "num_head, " + "max_seq_len, max_seq_len] and dtype should be float32/float64." + "Mask is corresponding to Attn's elemnts one by one. Elements of Attn " + "will be set to zero if their corresponding mask is smaller than 0." + "This process happens before sorting X by attn."); + + AddInput("NewMask", + "(Tensor)" + "The input of fused_token_prune op, whose shape should be [bsz, " + "num_head, slimmed_seq_len, slimmed_seq_len]." + "NewMask is just used to get slimmed_seq_len, so the value of " + "this input is not important in this op."); + + AddOutput("SlimmedX", + "(Tensor)" + "The output of fused_token_prune op, whose shape should be [bsz, " + "slimmed_seq_len, C]." + "The tokens of X will be sorted by Attn firstly and then the " + "last (max_seq_len - slimmed_seq_len)" + "tokens will be deleted. SlimmedX is the remainning part of X. " + ""); + + AddOutput( + "CLSInds", + "(Tensor)" + "The output of fused_token_prune op, whose shape should be [bsz, " + "slimmed_seq_len] and dtype is int64. CLSInds contains token indices " + " of each batch after sorting and pruning. "); + + AddAttr("keep_first_token", + "If keep_first_token is True, the element located in " + "CLSInds[:, 1] must be 0.") + .SetDefault(true); + + AddAttr("keep_order", + "If keep_order is True, the relative order of SlimmedX and " + "CLSInds remains unchanged") + .SetDefault(false); + + AddComment(R"DOC( + fused_token_prune op is used to fuse multiple ops to perform token pruning. + In this op: + 1. Elements of Attn will be set to zero if their corresponding mask is smaller than 0. + 2. The second dimension of X will be sorted by Attn. + 3. The last (max_seq_len - slimmed_seq_len) lines of X will be pruned. + 4. The remainning part of sorted X will output. 
+ )DOC"); + } +}; + +class FusedTokenPruneOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Attn"), "Input", "Attn", "FusedTokenPrune"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedTokenPrune"); + OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "FusedTokenPrune"); + OP_INOUT_CHECK( + ctx->HasInput("NewMask"), "Input", "NewMask", "FusedTokenPrune"); + OP_INOUT_CHECK( + ctx->HasOutput("SlimmedX"), "Output", "SlimmedX", "FusedTokenPrune"); + OP_INOUT_CHECK( + ctx->HasOutput("CLSInds"), "Output", "CLSInds", "FusedTokenPrune"); + + auto mask_dim = ctx->GetInputDim("Mask"); + auto attn_dim = ctx->GetInputDim("Attn"); + auto x_dim = ctx->GetInputDim("X"); + auto new_mask_dim = ctx->GetInputDim("NewMask"); + + // check input dims number + PADDLE_ENFORCE_EQ(mask_dim.size(), + 4, + platform::errors::InvalidArgument( + "The input mask must be 4-dimention")); + PADDLE_ENFORCE_EQ(attn_dim.size(), + 4, + platform::errors::InvalidArgument( + "The input attn must be 4-dimention")); + PADDLE_ENFORCE_EQ( + x_dim.size(), + 3, + platform::errors::InvalidArgument("The input x must be 4-dimention")); + PADDLE_ENFORCE_EQ(new_mask_dim.size(), + 4, + platform::errors::InvalidArgument( + "The input attn must be 4-dimention")); + + // check input dims relations + PADDLE_ENFORCE_EQ(mask_dim[0], + attn_dim[0], + platform::errors::InvalidArgument( + "The first dim of mask and attn should be the same" + "which is batch size")); + PADDLE_ENFORCE_EQ(mask_dim[1], + attn_dim[1], + platform::errors::InvalidArgument( + "The second dim of mask and attn should be the same" + "which is nb_head")); + PADDLE_ENFORCE_EQ(mask_dim[0], + x_dim[0], + platform::errors::InvalidArgument( + "The first dim of mask and x should be the same" + "which is batch size")); + PADDLE_ENFORCE_EQ( + mask_dim[2], + mask_dim[3], + platform::errors::InvalidArgument( + "The third dim and the fourth dim of mask should be the same" + "which is max seq len")); + PADDLE_ENFORCE_EQ( + attn_dim[2], + attn_dim[3], + platform::errors::InvalidArgument( + "The third dim and the fourth dim of mask should be the same" + "which is max seq len")); + PADDLE_ENFORCE_EQ(attn_dim[2], + mask_dim[2], + platform::errors::InvalidArgument( + "The third dim of mask and attn should be the same" + "which is max seq len")); + PADDLE_ENFORCE_EQ(attn_dim[2], + x_dim[1], + platform::errors::InvalidArgument( + "The third dim of mask and the second dim of attn" + "should be the same which is max seq len")); + + auto bsz = mask_dim[0]; + auto c = x_dim[2]; + auto slim_seq_len = new_mask_dim[2]; + + ctx->SetOutputDim("SlimmedX", {bsz, slim_seq_len, c}); + ctx->SetOutputDim("CLSInds", {bsz, slim_seq_len}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + fused_token_prune, + ops::FusedTokenPruneOp, + ops::FusedTokenPruneOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused_token_prune_op.cu b/paddle/fluid/operators/fused_token_prune_op.cu new file mode 100644 index 0000000000000..90044f30d8a6e --- /dev/null +++ b/paddle/fluid/operators/fused_token_prune_op.cu @@ -0,0 +1,287 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused_token_prune_op.cu.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +struct AttnMaskFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { + return b >= 0 ? a : 0; + } +}; + +__global__ void FillIndex(int64_t* indices, int num_raws, int num_cols) { + int num_threads = num_raws * num_cols; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < num_threads; tid += stride) { + int col = tid % num_cols; + indices[tid] = (int64_t)col; + } +} + +template +__global__ void TakeAlongAxis(const T* src, + T* dst, + int64_t* indices, + int num_raws, + int src_num_cols, + int dst_num_cols, + int num_elements) { + int num_threads = num_raws * dst_num_cols; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < num_threads; tid += stride) { + int raw = tid / dst_num_cols; + int col = tid % dst_num_cols; + for (int i = 0; i < num_elements; ++i) { + dst[tid * num_elements + i] = + *(src + (raw * src_num_cols + indices[tid]) * num_elements + i); + } + } +} + +template +__global__ void MaximumFirst(T* mat, int num_raws, int num_cols, T max_value) { + int num_threads = num_raws; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + for (; tid < num_threads; tid += stride) { + mat[tid * num_cols] = max_value; + } +} + +template +class FusedTokenPruneOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = context.cuda_device_context(); + // Inouts + const Tensor* attn = context.Input("Attn"); + const Tensor* x = context.Input("X"); + const Tensor* mask = context.Input("Mask"); + const Tensor* new_mask = context.Input("NewMask"); + + // Input dims + auto attn_dims = attn->dims(); + auto x_dims = x->dims(); + auto new_mask_dims = new_mask->dims(); + + auto bsz = attn_dims[0]; + auto num_heads = attn_dims[1]; + auto max_seq_len = attn_dims[2]; + auto c = x_dims[2]; + int slimmed_x_len = new_mask_dims[2]; + + // Attrs + const bool keep_first_token = context.Attr("keep_first_token"); + const bool keep_order = context.Attr("keep_order"); + + // Outputs + Tensor* out_slimmed_x = context.Output("SlimmedX"); + Tensor* slimmed_indices = context.Output("CLSInds"); + auto* out_slimmed_x_data = + out_slimmed_x->mutable_data(context.GetPlace()); + auto* slimmed_indices_data = + slimmed_indices->mutable_data(context.GetPlace()); + + // Intermediate variable + Tensor attn_tmp; + auto* attn_tmp_data = + attn_tmp.mutable_data(attn_dims, context.GetPlace()); + Tensor attn_accu; + auto* attn_accu_data = + 
attn_accu.mutable_data({bsz, max_seq_len}, context.GetPlace()); + Tensor attn_accu_indices; + auto* attn_accu_indices_data = attn_accu_indices.mutable_data( + {bsz, max_seq_len}, context.GetPlace()); + Tensor sort_attn_accu; + auto* sort_attn_accu_data = + sort_attn_accu.mutable_data({bsz, max_seq_len}, context.GetPlace()); + Tensor sort_attn_accu_indices; + auto* sort_attn_accu_indices_data = + sort_attn_accu_indices.mutable_data({bsz, max_seq_len}, + context.GetPlace()); + Tensor temp_storage; + + // 1. Filter attn by mask + std::vector ins; + std::vector outs; + ins.emplace_back(attn); + ins.emplace_back(mask); + outs.emplace_back(&attn_tmp); + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, -1, AttnMaskFunctor()); + + // 2. Reduce sum + const std::vector reduce_dims{1, 2}; + phi::Reduce(dev_ctx, + attn_tmp, + false, + reduce_dims, + false, + attn_accu.dtype(), + &attn_accu); + // 3. Prepare token indices + phi::backends::gpu::GpuLaunchConfig config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * max_seq_len); + FillIndex<<>>(attn_accu_indices_data, bsz, max_seq_len); + + // 4. Sort token indices by attn + if (keep_first_token) { + T max = std::numeric_limits::max(); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz); + MaximumFirst + <<>>(attn_accu_data, bsz, max_seq_len, max); + } + size_t temp_storage_bytes = -1; + int num_items = bsz * max_seq_len; + int num_segments = bsz; + + cub::CountingInputIterator counting_iter(0); + cub::TransformInputIterator> + segment_offsets_t(counting_iter, SegmentOffsetIter(max_seq_len)); + // Determine temporary device storage requirements + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceSegmentedRadixSort::SortPairsDescending( + nullptr, + temp_storage_bytes, + attn_accu_data, + sort_attn_accu_data, + attn_accu_indices_data, + sort_attn_accu_indices_data, + num_items, + num_segments, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + dev_ctx.stream())); + // Allocate temporary storage + int64_t temp_size = temp_storage_bytes; + auto* temp_storage_data = + temp_storage.mutable_data({temp_size}, context.GetPlace()); + // Run sorting operation + PADDLE_ENFORCE_GPU_SUCCESS( + cub::DeviceSegmentedRadixSort::SortPairsDescending( + temp_storage_data, + temp_storage_bytes, + attn_accu_data, + sort_attn_accu_data, + attn_accu_indices_data, + sort_attn_accu_indices_data, + num_items, + num_segments, + segment_offsets_t, + segment_offsets_t + 1, + 0, + sizeof(T) * 8, + dev_ctx.stream())); + // 5. Slice + auto slimmed_indices_tmp = + phi::funcs::Slice(dev_ctx, + sort_attn_accu_indices, + {1} /*axes*/, + {0} /*starts*/, + {slimmed_x_len} /*ends*/); + if (keep_order) { + // 6. 
reorder + num_items = bsz * slimmed_x_len; + temp_storage_bytes = -1; + cub::TransformInputIterator> + segment_offsets_t2(counting_iter, SegmentOffsetIter(slimmed_x_len)); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortKeys( + nullptr, + temp_storage_bytes, + static_cast(slimmed_indices_tmp.data()), + static_cast(slimmed_indices->data()), + num_items, + num_segments, + segment_offsets_t2, + segment_offsets_t2 + 1, + 0, + sizeof(int64_t) * 8, + dev_ctx.stream())); + temp_size = temp_storage_bytes; + temp_storage.Resize({temp_size}); + temp_storage_data = + temp_storage.mutable_data(context.GetPlace()); + PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedRadixSort::SortKeys( + temp_storage_data, + temp_storage_bytes, + static_cast(slimmed_indices_tmp.data()), + static_cast(slimmed_indices->data()), + num_items, + num_segments, + segment_offsets_t2, + segment_offsets_t2 + 1, + 0, + sizeof(int64_t) * 8, + dev_ctx.stream())); + } else { + framework::TensorCopy( + slimmed_indices_tmp, context.GetPlace(), slimmed_indices); + } + // 7. Get slimmed X by indices + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, bsz * slimmed_x_len); + TakeAlongAxis<<>>(x->data(), + out_slimmed_x_data, + slimmed_indices->data(), + bsz, + max_seq_len, + slimmed_x_len, + c); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fused_token_prune, + ops::FusedTokenPruneOpCUDAKernel, + ops::FusedTokenPruneOpCUDAKernel); diff --git a/paddle/fluid/operators/fused_token_prune_op.cu.h b/paddle/fluid/operators/fused_token_prune_op.cu.h new file mode 100644 index 0000000000000..e1e73a5e3d9e2 --- /dev/null +++ b/paddle/fluid/operators/fused_token_prune_op.cu.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace paddle { +namespace operators { + +HOSTDEVICE inline int CeilDivide(int n, int m) { return (n + m - 1) / m; } + +inline int ComputeBlockSize(int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; +} + +// Iter for move to next row +struct SegmentOffsetIter { + EIGEN_DEVICE_FUNC + explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { + return idx * num_cols_; + } + + int num_cols_; +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 6f79a248cf38b..7a67bf95d15a8 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -28,6 +28,13 @@ if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_c_allreduce") endif() +if(WIN32) + list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES + "test_trt_convert_fused_token_prune") + list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_fused_token_prune") + list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_fused_token_prune") +endif() + # Only for cpu(mkl + openblas) set(TEST_INFERENCE_CPU_UT "test_mul_lstm_fuse_pass" "test_mul_gru_fuse_pass") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fused_token_prune.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fused_token_prune.py new file mode 100644 index 0000000000000..85c56506de5cf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fused_token_prune.py @@ -0,0 +1,129 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
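+# The converter test below feeds fixed-shape inputs (Attn/Mask: [4, 12, 64, 64],
+# X: [4, 64, 76], NewMask: [4, 12, 32, 32]), sweeps the keep_first_token and
+# keep_order attributes, and checks the TensorRT engine under both FP32 and
+# FP16 precision.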
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertFusedTokenPruneTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_attn_or_mask(attrs: List[Dict[str, Any]]): + return np.ones([4, 12, 64, 64]).astype(np.float32) + + def generate_x(attrs: List[Dict[str, Any]]): + return np.random.random([4, 64, 76]).astype(np.float32) + + def generate_new_mask(attrs: List[Dict[str, Any]]): + return np.random.random([4, 12, 32, 32]).astype(np.float32) + + for keep_first_token in [True, False]: + for keep_order in [True, False]: + dics = [{ + "keep_first_token": keep_first_token, + "keep_order": keep_order + }] + ops_config = [{ + "op_type": "fused_token_prune", + "op_inputs": { + "Attn": ["attn"], + "X": ["x"], + "Mask": ["mask"], + "NewMask": ["new_mask"] + }, + "op_outputs": { + "SlimmedX": ["slimmed_x"], + "CLSInds": ["cls_inds"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "attn": + TensorConfig( + data_gen=partial(generate_attn_or_mask, dics)), + "x": + TensorConfig(data_gen=partial(generate_x, dics)), + "mask": + TensorConfig( + data_gen=partial(generate_attn_or_mask, dics)), + "new_mask": + TensorConfig(data_gen=partial(generate_new_mask, dics)) + }, + outputs=["slimmed_x", "cls_inds"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "attn": [4, 12, 64, 64], + "x": [4, 64, 76], + "mask": [4, 12, 64, 64], + "new_mask": [4, 12, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "attn": [4, 12, 64, 64], + "x": [4, 64, 76], + "mask": [4, 12, 64, 64], + "new_mask": [4, 12, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "attn": [4, 12, 64, 64], + "x": [4, 64, 76], + "mask": [4, 12, 64, 64], + "new_mask": [4, 12, 32, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 6 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5, 1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5, 1e-5, 1e-5) + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_token_prune_op.py b/python/paddle/fluid/tests/unittests/test_fused_token_prune_op.py new file mode 100644 index 0000000000000..9425283f078c0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_token_prune_op.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +from op_test import OpTest +from paddle.framework import core + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFusedTokenPruneOp(OpTest): + + def setDtype(self): + self.dtype = np.float32 + + def setInouts(self): + attn = [[1, 2], [3, 4]] + attn = np.array(attn, dtype=self.dtype) + attn = np.expand_dims(attn, axis=0) + self.attn = np.expand_dims( + attn, axis=0) # [1,1,2,2] bsz = 1, nd_head=1, max_seq_len=2 + mask = [[1, 1], [-1, -1]] + mask = np.array(mask, dtype=self.dtype) + mask = np.expand_dims(mask, axis=0) + self.mask = np.expand_dims(mask, axis=0) # same as attn + x = [[1, 2, 3], [4, 5, 6]] + x = np.array(x, dtype=self.dtype) + self.x = np.expand_dims(x, + axis=0) # [1, 2, 3] bsz = 1, max_seq_len=2, c=3 + new_mask = [[1]] + new_mask = np.array(new_mask, dtype=self.dtype) + new_mask = np.expand_dims(new_mask, axis=0) + self.new_mask = np.expand_dims(new_mask, axis=0) #[1, 1, 1, 1] + + out_slimmedx_py = [[[1, 2, 3]]] + self.out_slimmedx_py = np.array(out_slimmedx_py, dtype=self.dtype) + + out_cls_inds_py = [[0]] + self.out_cls_inds_py = np.array(out_cls_inds_py, dtype='int64') + + def setUp(self): + self.op_type = 'fused_token_prune' + self.setDtype() + self.setInouts() + self.inputs = { + 'Attn': self.attn, + 'Mask': self.mask, + 'X': self.x, + 'NewMask': self.new_mask + } + + self.outputs = { + 'SlimmedX': self.out_slimmedx_py, + 'CLSInds': self.out_cls_inds_py + } + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFusedTokenPruneOpFloat64(TestFusedTokenPruneOp): + + def setDtype(self): + self.dtype = np.float64 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFusedTokenPruneOp2(TestFusedTokenPruneOp): + + def setInouts(self): + attn = [[[[1, 2, 3, 4], [4, 3, 2, 1], [5, 9, 5, 4], [9, 6, 5, 4]], + [[8, 5, 2, 0], [1, 0, 2, 3], [2, 2, 3, 2], [7, 4, 1, 8]]]] + self.attn = np.array( + attn, + dtype=self.dtype) # [1,2,4,4] bsz = 1, nd_head=2, max_seq_len=4 + mask = [[[[-1, -1, -1, 1], [-1, -1, 1, 1], [-1, -1, 1, 1], + [-1, -1, 1, 1]], + [[-1, -1, 1, 1], [-1, -1, 1, 1], [-1, -1, 1, 1], + [-1, -1, 1, 1]]]] + self.mask = np.array(mask, dtype=self.dtype) # same as attn + x = [[[1.1, 1.1, 1.1], [2.2, 2.2, 2.2], [3.3, 3.3, 3.3], + [4.4, 4.4, 4.4]]] + self.x = np.array( + x, dtype=self.dtype) # [1, 4, 3] bsz = 1, max_seq_len=4, c=3 + self.new_mask = np.random.rand(1, 2, 2, + 2).astype(self.dtype) #[1, 2, 2, 2] + + out_slimmedx_py = [[[1.1, 1.1, 1.1], [4.4, 4.4, 4.4]]] #[1, 2, 3] + self.out_slimmedx_py = np.array(out_slimmedx_py, dtype=self.dtype) + + out_cls_inds_py = [[0, 3]] + self.out_cls_inds_py = np.array(out_cls_inds_py, dtype='int64') + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/static_mode_white_list.py 
b/tools/static_mode_white_list.py index 95c5ecf713112..7e92b6b9b7afc 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -233,6 +233,7 @@ 'test_fused_elemwise_activation_op', 'test_fused_emb_seq_pool_op', 'test_fused_embedding_fc_lstm_op', + 'test_fused_token_prune_op', 'test_fusion_gru_op', 'test_fusion_lstm_op', 'test_fusion_repeated_fc_relu_op', From 676d0b42fdf482e7a9dd80d384ca8139041a0d14 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Fri, 15 Jul 2022 14:52:41 +0800 Subject: [PATCH 218/250] Fix run inference bug for standalone executor (#44340) --- python/paddle/fluid/executor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index fac39df117bef..cf00075edcf86 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1404,10 +1404,14 @@ def _can_use_interpreter_core(program, place): return False # Unsupported case 3: data parallel - if program._is_data_parallel == True and len( + if program._is_data_parallel and len( program._get_places(place, program._places)) != 1: return False + # Unsupported case 4: inference + if program._is_inference: + return False + return True else: if isinstance(program._graph, compiler.CompiledProgram): From 9181a99bddd8f992b72cee94121adbc17e7131a0 Mon Sep 17 00:00:00 2001 From: taixiurong Date: Fri, 15 Jul 2022 15:44:56 +0800 Subject: [PATCH 219/250] =?UTF-8?q?xpu-paddlepaddle-33=20[=E4=BB=BB?= =?UTF-8?q?=E5=8A=A1]=20matmul=E5=8D=95=E6=B5=8B=20timeout=20(#44333)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test=kunlun --- .../fluid/tests/unittests/xpu/CMakeLists.txt | 4 +++- .../unittests/xpu/get_test_cover_info.py | 1 - .../tests/unittests/xpu/test_matmul_op_xpu.py | 20 +++++++++++++++++++ .../unittests/xpu/test_matmul_v2_op_xpu.py | 5 +++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt index c6aaf363138d4..cf70f63580b99 100644 --- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt @@ -24,5 +24,7 @@ foreach(TEST_OP ${DIST_TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach() -set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_v2_op_xpu PROPERTIES TIMEOUT 900) +set_tests_properties(test_matmul_op_xpu PROPERTIES TIMEOUT 300) diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index 3da9e32b015ed..f58c0d4cf074c 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -87,7 +87,6 @@ xpu_test_op_type_white_list = [ 'dropout_float16', 'dropout_grad_float16', - 'matmul_v2_float16', "grad_add_float32" # no api for grad_add, skip ] xpu_test_device_op_white_list = [] diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py index 1c68f8fb6bf16..73f61c2d9d5ba 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py @@ -294,6 +294,10 @@ def 
setUp(self): self.op_type = "matmul" self.dtype = np.float32 if not hasattr(self, 'in_type') else self.in_type + + self.__class__.no_need_check_grad = False if not hasattr( + self, 'no_need_check_grad') else self.no_need_check_grad + shape_X = [4, 5] if not hasattr(self, 'shape_X') else self.shape_X shape_Y = [5, 6] if not hasattr(self, 'shape_Y') else self.shape_Y transpose_X = False if not hasattr(self, @@ -314,12 +318,20 @@ def test_check_output(self): self.check_output_with_place(place, atol=1e-3) def test_check_grad_normal(self): + if hasattr(self.__class__, "no_need_check_grad" + ) and self.__class__.no_need_check_grad == True: + return + place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['X', 'Y'], 'Out', max_relative_error=5e-2) def test_check_grad_ignore_x(self): + if hasattr(self.__class__, "no_need_check_grad" + ) and self.__class__.no_need_check_grad == True: + return + place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['Y'], 'Out', @@ -327,6 +339,10 @@ def test_check_grad_ignore_x(self): no_grad_set=set("X")) def test_check_grad_ignore_y(self): + if hasattr(self.__class__, "no_need_check_grad" + ) and self.__class__.no_need_check_grad == True: + return + place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['X'], 'Out', @@ -351,6 +367,9 @@ def dynamic_create_class(self): for transose_x in [True, False]: for transose_y in [True, False]: for batch in batch_size: + no_need_check_grad = False + if batch >= 5: + no_need_check_grad = True class_name = ( 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}' .format(dim_X, dim_Y, transose_x, transose_y, @@ -362,6 +381,7 @@ def dynamic_create_class(self): 'shape_Y': shape_y, 'transpose_X': transose_x, 'transpose_Y': transose_y, + 'no_need_check_grad': no_need_check_grad, 'op_type': "matmul" } classes.append([class_name, attr_dict]) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py index 8f31981355403..92b9ae3ae8998 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -80,6 +80,8 @@ def setUp(self): self.dtype = self.in_type self.config() self.op_type = "matmul_v2" + if self.dtype == np.float16 or self.dtype == "float16": + self.__class__.no_need_check_grad = True x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) # -0.1 ~ 0.1 @@ -99,6 +101,9 @@ def test_check_output(self): self.check_output_with_place(place) def test_check_grad(self): + if hasattr(self.__class__, "no_need_check_grad" + ) and self.__class__.no_need_check_grad == True: + return place = paddle.XPUPlace(0) self.check_grad_with_place(place, ['X', 'Y'], 'Out') From c8e26fea5e2d3ac3c14eb12493e1ef784dbf1a4e Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Fri, 15 Jul 2022 17:04:08 +0800 Subject: [PATCH 220/250] [IPU] add custom-op UTs 0/N (#44328) * add custom-op UTs 0 * add authors Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen --- .../fluid/tests/unittests/ipu/CMakeLists.txt | 4 +- .../unittests/ipu/custom_ops/CMakeLists.txt | 12 ++ .../ipu/custom_ops/custom_checkpointoutput.cc | 43 +++++ .../unittests/ipu/custom_ops/custom_detach.cc | 42 +++++ .../ipu/custom_ops/custom_identity.cc | 42 +++++ .../unittests/ipu/custom_ops/custom_nll.cc | 62 +++++++ .../ipu/custom_ops/deprecated/CMakeLists.txt | 3 + 
.../custom_ops/deprecated/custom_nllloss.cc | 53 ++++++ .../deprecated/test_custom_nllloss_ipu.py | 110 +++++++++++ .../custom_ops/test_checkpointoutput_ipu.py | 88 +++++++++ .../ipu/custom_ops/test_custom_ops_ipu.py | 174 ++++++++++++++++++ 11 files changed, 632 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_checkpointoutput.cc create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_detach.cc create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_identity.cc create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_nll.cc create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/custom_nllloss.cc create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/test_custom_nllloss_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/test_checkpointoutput_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_ops_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt index 0174274827358..2b698ce9363fd 100644 --- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -4,7 +4,6 @@ if(WITH_IPU) RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) # set all UTs timeout to 200s @@ -15,4 +14,7 @@ if(WITH_IPU) set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 300) set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600) set_tests_properties(test_save_load_ipu PROPERTIES TIMEOUT 600) + + add_subdirectory(custom_ops) + endif() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/custom_ops/CMakeLists.txt new file mode 100644 index 0000000000000..d7615f933aad0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/CMakeLists.txt @@ -0,0 +1,12 @@ +if(WITH_IPU) + file( + GLOB CUSTOM_OP_TESTS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") + string(REPLACE ".py" "" CUSTOM_OP_TESTS "${CUSTOM_OP_TESTS}") + foreach(CUSTOM_OP_TEST ${CUSTOM_OP_TESTS}) + py_test(${CUSTOM_OP_TEST} SRCS ${CUSTOM_OP_TEST}.py) + endforeach() + + add_subdirectory(deprecated) +endif() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_checkpointoutput.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_checkpointoutput.cc new file mode 100644 index 0000000000000..c2957ba224886 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_checkpointoutput.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/extension.h" + +namespace { +std::vector> InferShape(std::vector x_shape) { + return {x_shape}; +} + +std::vector InferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector OpForward(const paddle::Tensor &x) { return {x}; } + +std::vector OpBackward(const paddle::Tensor &x) { return {x}; } +} // namespace + +// https://github.com/graphcore/popart/blob/sdk-release-2.5/willow/src/builder_impl.cpp#L1458 +// only support one input +PD_BUILD_OP(checkpointoutput) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(checkpointoutput) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_detach.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_detach.cc new file mode 100644 index 0000000000000..2eb62599c0e36 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_detach.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/extension.h" + +namespace { +std::vector> InferShape(std::vector x_shape) { + return {x_shape}; +} + +std::vector InferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector OpForward(const paddle::Tensor &x) { return {x}; } + +std::vector OpBackward(const paddle::Tensor &x) { return {x}; } +} // namespace + +// https://github.com/graphcore/popart/blob/sdk-release-2.5/willow/src/builder.cpp#L502 +PD_BUILD_OP(custom_detach) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(custom_detach) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_identity.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_identity.cc new file mode 100644 index 0000000000000..0ed9cc7440f0b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_identity.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/extension.h" + +namespace { +std::vector> InferShape(std::vector x_shape) { + return {x_shape}; +} + +std::vector InferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector OpForward(const paddle::Tensor &x) { return {x}; } + +std::vector OpBackward(const paddle::Tensor &x) { return {x}; } +} // namespace + +// https://github.com/graphcore/popart/blob/sdk-release-2.5/willow/src/builder.gen.cpp#L620 +PD_BUILD_OP(custom_identity) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(custom_identity) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_nll.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_nll.cc new file mode 100644 index 0000000000000..f08c1c326baca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/custom_nll.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
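+// Note: the host-side kernels below are pass-through placeholders; the actual
+// computation happens on IPU once this op is mapped to PopART's Nll op (see
+// test_custom_ops_ipu.py). `reduction` is a string ("Sum", "Mean" or "None");
+// with "None" the output keeps the label shape, otherwise it collapses to [1].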
+ +#include "paddle/extension.h" + +namespace { +std::vector> InferShape( + std::vector x_shape, + std::vector y_shape, + const std::string &reduction, + const int &ignoreIndex, + const bool &inputIsLogProbability) { + // reduction type: Sum, Mean, None + if (reduction == "None") { + return {y_shape}; + } else { + return {{1}}; + } +} + +std::vector InferDtype(paddle::DataType x_dtype, + paddle::DataType y_dtype) { + return {x_dtype}; +} + +std::vector OpForward(const paddle::Tensor &x, + const paddle::Tensor &y) { + return {x}; +} + +std::vector OpBackward(const paddle::Tensor &x) { return {x}; } +} // namespace + +// https://github.com/graphcore/popart/blob/sdk-release-2.5/willow/src/builder.cpp#L775 +// type of `reduction` is std::string +// `ignoreIndex` is optional, if no need, need to remove it manually(which is a +// new custom op in paddle) +PD_BUILD_OP(custom_nll) + .Inputs({"X", "Y"}) + .Outputs({"Out"}) + .Attrs({"reduction: std::string", + "ignoreIndex: int", + "inputIsLogProbability: bool"}) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype)) + .SetKernelFn(PD_KERNEL(OpForward)); + +PD_BUILD_GRAD_OP(custom_nll) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(OpBackward)); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/CMakeLists.txt new file mode 100644 index 0000000000000..c132a2517e80b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_IPU) + py_test(test_custom_nllloss_ipu SRCS test_custom_nllloss_ipu.py) +endif() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/custom_nllloss.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/custom_nllloss.cc new file mode 100644 index 0000000000000..a4f123144d39a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/custom_nllloss.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
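+// Deprecated variant of the NLL loss custom op: here `reduction` is an int
+// (0: sum, 1: mean, 2: none) and `ignoreIndex` is a string. The registered
+// kernel functions are empty stubs, as the op is only lowered to PopART on IPU.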
+ +#include "paddle/extension.h" + +std::vector Kernel_Function() { return {}; } +std::vector Kernel_Function_Grad() { return {}; } + +// nllloss +std::vector> InferShape_NllLoss( + std::vector x_shape, + std::vector y_shape, + const int& reduction, + const std::string& ignoreIndex, + const bool& inputIsLogProbability) { + // 0: sum, 1: mean, 2: none + if (reduction == 2) { + return {y_shape}; + } else { + return {{1}}; + } +} + +std::vector InferDtype_NllLoss(paddle::DataType x_dtype, + paddle::DataType y_dtype) { + return {x_dtype}; +} + +PD_BUILD_OP(custom_nll_loss) + .Inputs({"X", "Y"}) + .Outputs({"Out"}) + .Attrs({"reduction: int", + "ignoreIndex: std::string", + "inputIsLogProbability: bool"}) + .SetKernelFn(PD_KERNEL(Kernel_Function)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape_NllLoss)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDtype_NllLoss)); + +PD_BUILD_GRAD_OP(custom_nll_loss) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(Kernel_Function_Grad)); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/test_custom_nllloss_ipu.py b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/test_custom_nllloss_ipu.py new file mode 100644 index 0000000000000..9ae7b307ca543 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/deprecated/test_custom_nllloss_ipu.py @@ -0,0 +1,110 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
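+# JIT-compiles the deprecated custom_nll_loss op above, applies it to a
+# [16, 20, 256] input with integer labels followed by paddle.mean, and runs
+# only under IPU_FP32.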
+ +import os +import unittest +import sys + +import numpy as np +import paddle +import paddle.static +from paddle.utils.cpp_extension import load + +sys.path.append( + os.path.dirname(os.path.dirname(os.path.dirname( + os.path.abspath(__file__))))) +from op_test_ipu import IPUOpTest + + +def load_custom_ops(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name="custom_nll_loss", + sources=[f"{cur_dir}/custom_nllloss.cc"], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx'], + extra_ldflags=['-lpopfloat']) + return custom_ops + + +class TestBase(IPUOpTest): + + def setUp(self): + self.load_custom_ops() + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return False + + def load_custom_ops(self): + self.custom_ops = load_custom_ops() + + def set_data_feed(self): + x = np.random.rand(16, 20, 256).astype('float32') + label = np.random.uniform(0, 256, size=[16, 20]).astype('int32') + self.feed_fp32 = { + 'x': x, + 'label': label, + } + + def set_test_op(self): + self.op = self.custom_ops.custom_nll_loss + self.op_attrs = { + "reduction": 0, + "ignoreindex": "0", + "inputislogprobability": False, + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + label = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='int32') + out = self.op(x, label, **self.op_attrs) + out = paddle.mean(out) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + self.build_model() + # only test IPU_FP32 + self.run_model(IPUOpTest.ExecutionMode.IPU_FP32) + print(self.output_dict) + + +class TestCase1(TestBase): + + def set_test_op(self): + self.op = self.custom_ops.custom_nll_loss + self.op_attrs = { + "reduction": 0, + "ignoreindex": "None", + "inputislogprobability": False, + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_checkpointoutput_ipu.py b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_checkpointoutput_ipu.py new file mode 100644 index 0000000000000..698cef211db66 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_checkpointoutput_ipu.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
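+# checkpointoutput accepts a single input (see the note in
+# custom_checkpointoutput.cc); the test wraps it between paddle.add and
+# paddle.mean and runs only under IPU_FP32.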
+ +import os +import unittest +import sys + +import numpy as np +import paddle +import paddle.static +from paddle.utils.cpp_extension import load + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from op_test_ipu import IPUOpTest + + +def load_custom_ops(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name="checkpointoutput", + sources=[ + f"{cur_dir}/custom_checkpointoutput.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx']) + return custom_ops + + +class TestCheckpointoutput(IPUOpTest): + + def setUp(self): + self.load_custom_ops() + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return False + + def load_custom_ops(self): + self.custom_ops = load_custom_ops() + + def set_test_op(self): + self.op = self.custom_ops.checkpointoutput + self.op_attrs = {} + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + x = paddle.add(x, x) + x = self.op(x, **self.op_attrs) + x = paddle.mean(x) + self.fetch_list = [x.name] + + def run_model(self, exec_mode): + self.run_op_test(exec_mode) + + def test(self): + self.build_model() + # only test IPU_FP32 + self.run_model(IPUOpTest.ExecutionMode.IPU_FP32) + print(self.output_dict) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_ops_ipu.py b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_ops_ipu.py new file mode 100644 index 0000000000000..0dc182354e5e0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_ops_ipu.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
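+# custom_detach, custom_identity and custom_nll are each JIT-compiled by their
+# own load_* helper (one op per load, see the data-race note below) and mapped
+# to their PopART counterparts (Detach, Identity, Nll) through
+# IpuStrategy.add_custom_op; only IPU_FP32 is exercised.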
+ +import os +import unittest +import sys + +import numpy as np +import paddle +import paddle.static +from paddle.utils.cpp_extension import load + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from op_test_ipu import IPUOpTest + + +# just load one custom-op for the data race issue under parallel mode +def load_custom_detach(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name=f"custom_detach", + sources=[ + f"{cur_dir}/custom_detach.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx'], + extra_ldflags=['-lpopfloat']) + return custom_ops + + +def load_custom_identity(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name=f"custom_identity", + sources=[ + f"{cur_dir}/custom_identity.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx'], + extra_ldflags=['-lpopfloat']) + return custom_ops + + +def load_custom_nll(): + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name=f"custom_nll", + sources=[ + f"{cur_dir}/custom_nll.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx'], + extra_ldflags=['-lpopfloat']) + return custom_ops + + +def build_ipu_strategy(): + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.add_custom_op(paddle_op="custom_detach", + popart_op="Detach", + domain="ai.graphcore", + version=1) + ipu_strategy.add_custom_op(paddle_op="custom_identity", + popart_op="Identity", + domain="ai.onnx", + version=11) + ipu_strategy.add_custom_op(paddle_op="custom_nll", + popart_op="Nll", + domain="ai.graphcore", + version=1) + return ipu_strategy + + +class TestBase(IPUOpTest): + + def setUp(self): + self.load_custom_ops() + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return False + + def load_custom_ops(self): + self.custom_ops = load_custom_detach() + + def set_test_op(self): + self.op = self.custom_ops.custom_detach + self.op_attrs = {} + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + @IPUOpTest.static_graph + def build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + out = self.op(x, **self.op_attrs) + out = paddle.mean(out) + self.fetch_list = [out.name] + + def run_model(self, exec_mode): + ipu_strategy = build_ipu_strategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + self.run_op_test(exec_mode, ipu_strategy=ipu_strategy) + + def test(self): + self.build_model() + # only test IPU_FP32 + self.run_model(IPUOpTest.ExecutionMode.IPU_FP32) + print(self.output_dict) + + +class TestIdentity(TestBase): + + def load_custom_ops(self): + self.custom_ops = load_custom_identity() + + def set_test_op(self): + self.op = self.custom_ops.custom_identity + self.op_attrs = {} + + +class TestNll(TestBase): + + def load_custom_ops(self): + self.custom_ops = load_custom_nll() + + def set_data_feed(self): + x = np.random.rand(16, 20, 256).astype('float32') + label = np.random.uniform(0, 256, size=[16, 20]).astype('int32') + self.feed_fp32 = { + 'x': x, + 'label': label, + } + + def set_test_op(self): + self.op = self.custom_ops.custom_nll + self.op_attrs = { + "reduction": "Sum", + "ignoreindex": 0, + "inputislogprobability": False, + } + + @IPUOpTest.static_graph + def 
build_model(self): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + label = paddle.static.data(name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='int32') + out = self.op(x, label, **self.op_attrs) + out = paddle.mean(out) + self.fetch_list = [out.name] + + +if __name__ == "__main__": + unittest.main() From 2c8c8419095c6724691bc7a0be6002c722611eb4 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Fri, 15 Jul 2022 17:05:21 +0800 Subject: [PATCH 221/250] [IPU] add custom-op UTs 1/N (#44329) * add custom-op UTs 1 * add authors Co-authored-by: Allen Guo Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen * update url Co-authored-by: Zhixin Yao Co-authored-by: Zhaorui Chen --- .../tests/unittests/ipu/custom_ops/README.md | 71 ++++++ .../ipu/custom_ops/leaky_relu_cpu.cc | 111 +++++++++ .../ipu/custom_ops/leaky_relu_ipu.cc | 229 ++++++++++++++++++ .../custom_ops/test_custom_leaky_relu_ipu.py | 124 ++++++++++ 4 files changed, 535 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/README.md create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_cpu.cc create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_ipu.cc create mode 100644 python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_leaky_relu_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/README.md b/python/paddle/fluid/tests/unittests/ipu/custom_ops/README.md new file mode 100644 index 0000000000000..efac2a764ad10 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/README.md @@ -0,0 +1,71 @@ +# Add custom op for Paddle on IPU + +## Add custom op in Paddle + +reference + +https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html + +## Write custom op for PopART + +reference + +https://docs.graphcore.ai/projects/popart-user-guide/en/latest/custom_ops.html + +## Register custom op for Paddle on IPU + +这里采用即时编译(JIT Compile) 的方法使用 custom op. + +### 实现 custom op + +根据上面的两个文档, 首先添加 custom op 的实现. + +`leaky_relu_cpu.cc` 包含了 Paddle 中 custom op 的定义和 cpu 实现, 这里的实现是和标准的 Paddle 添加 custom op 是完全一致的. 这里的 cpu 实现不是必须的, cpu 实现可以用来检验 ipu 实现的正确性. + +`leaky_relu_ipu.cc` 包含了 PopART 中 custom op 的定义和 ipu 实现, 同样的, 这里的实现和标准的 PopART 添加 custom op 是完全一致的. + +### 载入 custom op + +分别在 Paddle 和 PopART 中实现 custom op 的定义后, 使用 `paddle.utils.cpp_extension.load` 编译源文件并把对应的动态库加载到当前进程中. 
+ +```python + +cur_dir = os.path.dirname(os.path.realpath(__file__)) +custom_ops = load( + name="custom_jit_ops", + sources=[ + f"{cur_dir}/leaky_relu_cpu.cc", + f"{cur_dir}/leaky_relu_ipu.cc", + ], + # 编译 leaky_relu_ipu.cc 时需要添加此参数 + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx']) + +``` + +由于 Paddle 中 op 的定义和 PopART 中存在一些差异, 需要手动映射 custom op + +```python + +# paddle_op is custom op type in Paddle +# popart_op, domain and version is custom op identifier in PopART +ipu_strategy = paddle.static.IpuStrategy() +ipu_strategy.add_custom_op( + paddle_op="custom_leaky_relu", + popart_op="LeakyRelu", + domain='custom.ops', + version=1) + +``` + +### 使用 custom op + +```python + +x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) +# custom op +out = custom_ops.custom_leaky_relu(x, **self.attrs) + +``` diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_cpu.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_cpu.cc new file mode 100644 index 0000000000000..d118aa4380246 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_cpu.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/extension.h" + +#define CHECK_INPUT(x) \ + PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +template +void leaky_relu_cpu_forward_kernel(const data_t* x_data, + data_t* out_data, + int64_t x_numel, + float alpha) { + // x < 0.0f ? alpha * x : x + for (int i = 0; i < x_numel; ++i) { + if (x_data[i] > static_cast(0.)) { + out_data[i] = x_data[i]; + } else { + out_data[i] = static_cast(alpha) * x_data[i]; + } + } +} + +template +void leaky_relu_cpu_backward_kernel(const data_t* grad_out_data, + const data_t* out_data, + data_t* grad_x_data, + int64_t out_numel, + float alpha) { + // (grad * (x < 0.0f ? 
alpha : 1)) + for (int i = 0; i < out_numel; ++i) { + if (out_data[i] static_cast(0)) { + grad_x_data[i] = static_cast(alpha); + } else { + grad_x_data[i] = static_cast(1.); + } + } +} + +std::vector LeakyReluCPUForward(const paddle::Tensor& x, + float alpha) { + CHECK_INPUT(x); + + auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + + PD_DISPATCH_FLOATING_TYPES(x.type(), "relu_cpu_forward_kernel", ([&] { + leaky_relu_cpu_forward_kernel( + x.data(), + out.mutable_data(x.place()), + x.size(), + alpha); + })); + + return {out}; +} + +std::vector LeakyReluCPUBackward(const paddle::Tensor& x, + const paddle::Tensor& out, + const paddle::Tensor& grad_out, + float alpha) { + CHECK_INPUT(x); + CHECK_INPUT(out); + CHECK_INPUT(grad_out); + + auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + + PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward_kernel", ([&] { + leaky_relu_cpu_backward_kernel( + grad_out.data(), + out.data(), + grad_x.mutable_data(x.place()), + out.size(), + alpha); + })); + + return {grad_x}; +} + +std::vector> LeakyReluInferShape( + std::vector x_shape) { + return {x_shape}; +} + +std::vector LeakyReluInferDtype(paddle::DataType x_dtype) { + return {x_dtype}; +} + +PD_BUILD_OP(custom_leaky_relu) + .Inputs({"X"}) + .Outputs({"Out"}) + .Attrs({"alpha: float"}) + .SetKernelFn(PD_KERNEL(LeakyReluCPUForward)) + .SetInferShapeFn(PD_INFER_SHAPE(LeakyReluInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(LeakyReluInferDtype)); + +PD_BUILD_GRAD_OP(custom_leaky_relu) + .Inputs({"X", "Out", paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .Attrs({"alpha: float"}) + .SetKernelFn(PD_KERNEL(LeakyReluCPUBackward)); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_ipu.cc b/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_ipu.cc new file mode 100644 index 0000000000000..1fea75b3b5ae7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/leaky_relu_ipu.cc @@ -0,0 +1,229 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
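+// PopART/poplar side of the custom leaky_relu: registers the
+// {"custom.ops", "LeakyRelu", 1} operator and its grad op, and implements
+// forward (x < 0 ? alpha * x : x) and backward (grad * (x < 0 ? alpha : 1))
+// with popops expressions. The default alpha is 1e-2.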
+ +#include +#include +#include +#include + +#include + +namespace CustomOperators { +const popart::OperatorIdentifier LeakyReluId = {"custom.ops", "LeakyRelu", 1}; +} // namespace CustomOperators +namespace CustomGradOperators { +const popart::OperatorIdentifier LeakyReluGradId = { + "custom.ops", "LeakyReluGrad", 1}; +} // namespace CustomGradOperators + +class LeakyReluOp; +class LeakyReluOpx; +class LeakyReluGradOpx; + +class LeakyReluGradOp : public popart::Op { + public: + explicit LeakyReluGradOp(const LeakyReluOp &fwdOp); + + std::unique_ptr clone() const final { + return std::make_unique(*this); + } + void setup() final { outInfo(0) = inInfo(0); }; + + const std::vector &gradInputInfo() const; + + // The Grad Op has 1 output, which is the gradient of the only input + const std::map &gradOutToNonGradIn() const; + + bool requiresRandomSeed() const override { return false; } + + // an estimate of how valuable sub-graph matching will be + float getSubgraphValue() const final { return getHighSubgraphValue(); } + + float getAlpha() const { return alpha; } + + // Implementation defined below + void appendAttributes(popart::OpSerialiserBase &os) const override; + + // Implementation defined below + void appendOutlineAttributes(popart::OpSerialiserBase &os) const override; + + private: + float alpha; +}; + +class LeakyReluOp : public popart::Op { + public: + LeakyReluOp(const popart::OperatorIdentifier &_opid, + float _alpha, + const popart::Op::Settings &settings_) + : popart::Op(_opid, settings_), alpha(_alpha) {} + + std::unique_ptr clone() const final { + return std::make_unique(*this); + } + + void setup() final { outInfo(0) = inInfo(0); } + + void appendAttributes(popart::OpSerialiserBase &os) const override { + Op::appendAttributes(os); + os.appendAttribute("alpha", getAlpha()); + } + + void appendOutlineAttributes(popart::OpSerialiserBase &os) const override { + Op::appendOutlineAttributes(os); + os.appendAttribute("alpha", getAlpha()); + } + + std::vector> getGradOps() { + std::vector> upops; + upops.emplace_back(new LeakyReluGradOp(*this)); + return upops; + } + + float getSubgraphValue() const final { return getHighSubgraphValue(); } + + bool requiresRandomSeed() const override { return false; } + + // Attributes + float getAlpha() const { return alpha; } + + private: + float alpha; +}; + +namespace { +using popart::DataType; +using popart::OpDefinition; + +static OpDefinition::DataTypes T = {DataType::FLOAT16, DataType::FLOAT}; + +static OpDefinition leakyReluOpDef({OpDefinition::Inputs({{"input", T}}), + OpDefinition::Outputs({{"output", T}}), + OpDefinition::Attributes({{"alpha", + {"*"}}})}); + +static popart::OpCreator leakyReluOpCreator( + popart::OpDefinitions({{CustomOperators::LeakyReluId, leakyReluOpDef}}), + [](const popart::OpCreatorInfo &info) { + // default alpha is 10**(-2) + float alpha = info.attributes.getAttribute( + "alpha", 1e-2f); + return std::make_unique(info.opid, alpha, info.settings); + }, + true); +} // namespace + +static popart::RegisterShapeInferenceFunction leakyReluShapeInfer( + CustomOperators::LeakyReluId, + [](popart::ShapeInferenceContext &ctx // NO_LINT + ) { ctx.outInfo(0) = ctx.inInfo(0); }); + +namespace pe = popops::expr; + +class LeakyReluOpx : public popart::popx::Opx { + public: + LeakyReluOpx(popart::Op *op, popart::popx::Devicex *devicex) + : popart::popx::Opx(op, devicex) { + verifyOp(op, {CustomOperators::LeakyReluId}); + } + + void grow(poplar::program::Sequence &prog) const final { // NOLINT + popart::logging::ir::trace("start 
Growing LeakyReluOpx"); + + auto op = getOp(); + + poplar::Tensor input = getInTensor(0); + + float alpha = op.getAlpha(); + + // x < 0.0f ? alpha * x : x + auto expression = pe::Select(pe::Mul(pe::Const(alpha), pe::_1), + pe::_1, + pe::Lt(pe::_1, pe::Const(0.0f))); + + popops::mapInPlace(graph(), + expression, + {input}, + prog, + debugContext("LeakyRelu"), + poplar::OptionFlags()); + + setOutTensor(0, input); + } +}; + +class LeakyReluGradOpx : public popart::popx::Opx { + public: + LeakyReluGradOpx(popart::Op *op, popart::popx::Devicex *devicex) + : popart::popx::Opx(op, devicex) { + verifyOp(op, {CustomGradOperators::LeakyReluGradId}); + } + + void grow(poplar::program::Sequence &prog) const final { // NOLINT + auto op = getOp(); + + poplar::Tensor grad = getInTensor(0); + poplar::Tensor input = getInTensor(1); + + float alpha = op.getAlpha(); + + // (grad * (x < 0.0f ? alpha : 1)) + pe::Mul expression = pe::Mul( + pe::Select( + pe::Const(alpha), pe::Const(1.0f), pe::Lt(pe::_2, pe::Const(0.0f))), + pe::_1); + + auto output = popops::map(graph(), + expression, + {grad, input}, + prog, + debugContext("LeakyReluGrad"), + poplar::OptionFlags()); + + setOutTensor(0, output); + } +}; + +LeakyReluGradOp::LeakyReluGradOp(const LeakyReluOp &fwdOp) + : popart::Op(CustomGradOperators::LeakyReluGradId, fwdOp.settings), + alpha(fwdOp.getAlpha()) {} + +const std::vector &LeakyReluGradOp::gradInputInfo() + const { + static const std::vector inInfo = { + {0, 0, popart::GradOpInType::GradOut}, {1, 0, popart::GradOpInType::In}}; + return inInfo; +} + +// The Grad Op has 1 output, which is the gradient of the only input +const std::map &LeakyReluGradOp::gradOutToNonGradIn() const { + static const std::map outInfo = {{0, 0}}; + return outInfo; +} + +void LeakyReluGradOp::appendAttributes(popart::OpSerialiserBase &os) const { + Op::appendAttributes(os); + os.appendAttribute("alpha", getAlpha()); +} + +void LeakyReluGradOp::appendOutlineAttributes( + popart::OpSerialiserBase &os) const { + Op::appendOutlineAttributes(os); + os.appendAttribute("alpha", getAlpha()); +} + +static popart::popx::OpxCreator LeakyReluOpxCreator( + {CustomOperators::LeakyReluId}); +static popart::popx::OpxCreator LeakyReluGradOpxCreator( + {CustomGradOperators::LeakyReluGradId}); diff --git a/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_leaky_relu_ipu.py b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_leaky_relu_ipu.py new file mode 100644 index 0000000000000..fb3fcbf5fe416 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/custom_ops/test_custom_leaky_relu_ipu.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
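+# Compiles both leaky_relu_cpu.cc and leaky_relu_ipu.cc, registers the
+# custom_leaky_relu -> LeakyRelu (domain custom.ops) mapping via
+# IpuStrategy.add_custom_op, and asserts that CPU and IPU results match within
+# the configured tolerance.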
+ +import os +import unittest +import sys + +import numpy as np +import paddle +import paddle.optimizer +import paddle.static +from paddle.utils.cpp_extension import load + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from op_test_ipu import IPUOpTest, np_dtype_to_fluid_str + + +def load_custom_ops(): + # load custom ops + cur_dir = os.path.dirname(os.path.realpath(__file__)) + custom_ops = load(name="custom_jit_ops", + sources=[ + f"{cur_dir}/leaky_relu_cpu.cc", + f"{cur_dir}/leaky_relu_ipu.cc", + ], + extra_cxx_cflags=['-DONNX_NAMESPACE=onnx']) + return custom_ops + + +class TestBase(IPUOpTest): + + def setUp(self): + self.set_atol() + self.set_training() + self.set_feed() + self.set_feed_attr() + self.set_attrs() + + def set_feed(self): + self.feed = { + "x": np.random.uniform(low=-2, high=2, size=[3, + 5]).astype('float32'), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [ + np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() + ] + + def set_attrs(self): + self.attrs = {'alpha': 0.1} + + def _test_base(self, run_ipu=True): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + SEED = self.SEED + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + custom_ops = load_custom_ops() + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data(name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + # custom op + out = custom_ops.custom_leaky_relu(x, **self.attrs) + fetch_list = [out.name] + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if run_ipu: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=False) + + # add name mapping for paddle custom op and popart custom ops + # `paddle_op` was defined in leaky_relu_cpu.cc + # `popart_op`, `domain` and `version` was defined in leaky_relu_ipu.cc + ipu_strategy.add_custom_op(paddle_op="custom_leaky_relu", + popart_op="LeakyRelu", + domain='custom.ops', + version=1) + + program = paddle.static.IpuCompiledProgram( + main_prog, scope=scope, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + res0 = self._test_base(False) + res1 = self._test_base(True) + + self.assertTrue( + np.allclose(res0.flatten(), res1.flatten(), atol=self.atol)) + + self.assertTrue(res0.shape == res1.shape) + + +if __name__ == "__main__": + unittest.main() From 270f25e91d45cb7b0d0f7fd621ba5ff68ed151ec Mon Sep 17 00:00:00 2001 From: zhangxiaoci Date: Fri, 15 Jul 2022 17:28:47 +0800 Subject: [PATCH 222/250] support KL2 multi-card training, *test=kunlun (#43889) * update xccl lib * use separate streams for compute/comm on XPU * add broadcast op to xpu2_op_list --- cmake/external/xpu.cmake | 5 +- paddle/fluid/imperative/bkcl_context.cc | 39 +++++++++-- paddle/fluid/imperative/bkcl_context.h | 8 +++ paddle/fluid/imperative/reducer.cc | 69 ++++++------------- paddle/fluid/platform/collective_helper.cc | 6 ++ .../fluid/platform/device/xpu/xpu2_op_list.h | 1 + paddle/phi/backends/all_context.h | 2 + paddle/phi/backends/xpu/xpu_context.cc | 4 ++ 
paddle/phi/backends/xpu/xpu_context.h | 2 + paddle/phi/core/kernel_utils.h | 2 + 10 files changed, 82 insertions(+), 56 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 3ead16451a3af..c1f8eb0e33c79 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -24,6 +24,9 @@ else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() +set(XPU_XCCL_BASE_URL + "https://klx-sdk-release-public.su.bcebos.com/xccl/release/1.0.0") + if(WITH_AARCH64) set(XPU_XRE_DIR_NAME "xre-kylin_aarch64") set(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") @@ -76,7 +79,7 @@ set(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) set(XPU_XCCL_URL - "${XPU_BASE_URL_WITHOUT_DATE}/20220411/${XPU_XCCL_DIR_NAME}.tar.gz" + "${XPU_XCCL_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) set(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 4bef3549d26c5..831e7dae942ae 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -110,6 +110,10 @@ void BKCLParallelContext::Init() { strategy_.local_rank_, xpu_id, ring_id); + compute_events_.emplace_back( + platform::XpuEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::XpuEventResourcePool::Instance().New(place_.device)); } } @@ -134,6 +138,11 @@ void BKCLParallelContext::InitWithRingID(int ring_id) { // it will assign bkcl_comm in XPUDeviceContext within ring_id platform::BKCLCommContext::Instance().CreateComm( &bkcl_ids[0], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id); + + compute_events_.emplace_back( + platform::XpuEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::XpuEventResourcePool::Instance().New(place_.device)); } void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, @@ -213,9 +222,18 @@ void BKCLParallelContext::WaitCompute(int ring_id) { "but got ring id = %d, nrings = %d", ring_id, strategy_.nrings_)); - auto compute_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place_)); - compute_dev_ctx->Wait(); + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto comm_stream = platform::BKCLCommContext::Instance() + .Get(ring_id, place_) + ->dev_context() + ->stream(); + auto event = compute_events_[ring_id].get(); + + // compute_stream-->event-->comm_stream + PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event, compute_stream)); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(comm_stream, event)); } void BKCLParallelContext::WaitComm(int ring_id) { @@ -230,9 +248,18 @@ void BKCLParallelContext::WaitComm(int ring_id) { "but got ring id = %d, nrings = %d", ring_id, strategy_.nrings_)); - auto comm_dev_ctx = - platform::BKCLCommContext::Instance().Get(ring_id, place_)->dev_context(); - comm_dev_ctx->Wait(); + auto comm_stream = platform::BKCLCommContext::Instance() + .Get(ring_id, place_) + ->dev_context() + ->stream(); + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto event = compute_events_[ring_id].get(); + + // comm_stream-->event-->compute_stream + PADDLE_ENFORCE_XPU_SUCCESS(xpu_event_record(event, comm_stream)); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_wait_event(compute_stream, event)); } void BKCLParallelContext::SynchronizeCompute() { diff 
--git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index 7ba1358959161..6a938924b9780 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -19,6 +19,7 @@ #include #include "paddle/fluid/imperative/parallel_context.h" +#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" #include "xpu/bkcl.h" namespace paddle { @@ -52,6 +53,13 @@ class BKCLParallelContext : public ParallelContext { void WaitComm(int ring_id) override; void SynchronizeCompute() override; + + private: + // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] + std::vector> compute_events_; + + // used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream + std::vector> comm_events_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 9dd61b6f5e3cd..468263e7be7ea 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -21,6 +21,9 @@ #include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" +#ifdef PADDLE_WITH_XPU_BKCL +#include "paddle/fluid/platform/device/xpu/enforce_xpu.h" +#endif #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { @@ -431,10 +434,6 @@ Reducer::Reducer(const std::vector> &vars, VLOG(3) << "Start construct the Reducer ..."; nrings_ = parallel_ctx->GetNRings(); nranks_ = parallel_ctx->GetNRanks(); -#ifdef PADDLE_WITH_XPU_BKCL - comm_pool_.reset(new ::ThreadPool(1)); - comm_op_count_ = 0; -#endif // initialize groups InitializeGroups(group_indices); for (size_t global_var_index = 0; global_var_index < vars_.size(); @@ -853,8 +852,23 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { #ifdef PADDLE_WITH_XPU_BKCL if (platform::is_xpu_place(group_tensor.place())) { - // TODO(liuyuhui) support XPU set constant - VLOG(3) << "XPU doesn't support set_constant"; + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + if (HasGrad(var_index)) { + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = + var_base->MutableVar()->GetMutable(); + group_tensor.ShareDataWith(*tensor).Resize( + {static_cast(length)}); + } else { + group_tensor.Resize({static_cast(length)}); + int r = xpu::constant(dev_ctx->x_context(), + reinterpret_cast(group_tensor.data()), + group_tensor.numel(), + 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + PADDLE_ENFORCE_XPU_SUCCESS(xpu_wait(dev_ctx->stream())); + } } #elif defined(PADDLE_WITH_CNCL) if (platform::is_mlu_place(group_tensor.place())) { @@ -948,33 +962,7 @@ void Reducer::MarkGroupReady(size_t group_index) { // so we expose WaitCompute() interface and call // it here. parallel_ctx_->WaitCompute(run_order); -#ifdef PADDLE_WITH_XPU_BKCL - { - std::lock_guard lock(mutex_); - comm_op_count_ += 1; // lock - } - // TODO(liuyuhui): Add try catch to deal with exception later, - // otherwise the main thread will continue to run when an exception is - // thrown in comm_pool_. 
- auto next_group = next_group_; - comm_pool_->enqueue([this, run_order, next_group, &group] { - auto dev_id = place_.device; - platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group, next_group); - { - std::lock_guard lock(mutex_); - comm_op_count_ -= 1; // lock - cv_.notify_all(); - } - }); -#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ - defined(PADDLE_WITH_CNCL) FusedAllReduceSchedule(run_order, group, next_group_); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Not compiled with BKCL or NCCL or CNCL or GLOO.")); -#endif } } @@ -997,17 +985,6 @@ void Reducer::FusedAllReduceSchedule(const int run_order, // group.dense_tensors ---> group.dense_contents_ group.ConcatTensors(dev_context); -// NOTE(liuyuhui): ConcatTensors use communication stream, but BKCL only support -// default stream for communicating, so there exist some problems in -// synchronization. And need to add a WaitComm there. -// TODO(liuyuhui): If BKCL support non-blocking communication, it should be -// fixed as multi gpus card training. -#ifdef PADDLE_WITH_XPU_BKCL - if (platform::is_xpu_place(group.dense_tensors_[0].place())) { - parallel_ctx_->WaitComm(run_order); - } -#endif - group.DivNRanks(dev_context, nranks_); // Start allreduce parallel_ctx_->AllReduceByStream( @@ -1135,12 +1112,6 @@ bool Reducer::HasGrad(size_t var_index) { void Reducer::FinalizeBackward() { groups_need_finalize_ = false; grad_need_hooks_ = false; -#ifdef PADDLE_WITH_XPU_BKCL - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return comm_op_count_ == 0; }); - } -#endif // Must prevent compute_stream_ starting until all comm streams have finished for (int i = 0; i < nrings_; ++i) { diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 655c5a98aeb51..00b5dd7f8afe9 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -347,6 +347,12 @@ BKCLComm* BKCLCommContext::AssignBKCLComm( BKCLContext_t comm, int nranks, int rank, int dev_id, int ring_id) { std::unique_ptr dev_ctx( new XPUDeviceContext(XPUPlace(dev_id))); + // used in BKCL as comm_stream, for every dev_id there is + // a comm_stream at each ring. this stream is passed as input var + // when calling collective comm commands like bkcl_all_reduce + XPUStream comm_stream; + PADDLE_ENFORCE_XPU_SUCCESS(xpu_stream_create(&comm_stream)); + dev_ctx->SetXPUStream(comm_stream); BKCLCommImpl* c = new BKCLCommImpl; c->set_ring_id(ring_id); diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index a3165bc989384..bd5957a122885 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -60,6 +60,7 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"bilinear_interp_v2_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace()), diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index 57e6f084fd4c9..392df09fcffd8 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ -23,7 +23,9 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#ifdef PADDLE_WITH_XPU #include "paddle/phi/backends/xpu/xpu_context.h" +#endif #ifndef PADDLE_WITH_CUSTOM_KERNEL // TODO(wilber): DeviceContextPool nees include fluid file. diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index e73aa30c8d85b..fe0dda2d3dbeb 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -66,6 +66,8 @@ struct XPUContext::Impl { const Place& GetPlace() const { return place_; } + void SetStream(XPUStream stream) { context_->xpu_stream = stream; } + xpu::Context* GetXContext() const { PD_CHECK(context_ != nullptr, "the xpu context is nullptr."); return context_; @@ -115,6 +117,8 @@ XPUContext::~XPUContext() = default; const Place& XPUContext::GetPlace() const { return impl_->GetPlace(); } +void XPUContext::SetXPUStream(XPUStream stream) { impl_->SetStream(stream); } + backends::xpu::XPUVersion XPUContext::xpu_version() const { return impl_->xpu_version_; } diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index d39b3c9cc1ff7..d20a1ad4e1e48 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -61,6 +61,8 @@ class XPUContext : public DeviceContext { void SetL3Cache(int l3_size = 14155776); + void SetXPUStream(XPUStream stream); + private: struct Impl; std::unique_ptr impl_; diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 73814fc3f4048..9206acfd51542 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -18,7 +18,9 @@ #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/onednn/onednn_context.h" +#ifdef PADDLE_WITH_XPU #include "paddle/phi/backends/xpu/xpu_context.h" +#endif #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" From 0dafbb03b8f564e01844951345aa7e5986a2f4eb Mon Sep 17 00:00:00 2001 From: zyfncg Date: Fri, 15 Jul 2022 18:01:29 +0800 Subject: [PATCH 223/250] Remove auto to_pascal_case for args in op generator (#44350) * remove auto to_pascal_case for args in op generator * fix yaml config --- .pre-commit-config.yaml | 2 +- paddle/phi/api/yaml/api_compat.yaml | 82 ++++++++++++++++--- paddle/phi/api/yaml/generator/filters.py | 4 +- .../generator/templates/operator_utils.c.j2 | 10 +-- 4 files changed, 79 insertions(+), 19 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5c6bded87ce4a..77bf882a312f9 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,7 @@ repos: files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer - id: sort-simple-yaml - files: (api|backward)\.yaml$ + files: (api|backward|api_[a-z_]+)\.yaml$ - repo: local hooks: - id: clang-format diff --git a/paddle/phi/api/yaml/api_compat.yaml b/paddle/phi/api/yaml/api_compat.yaml index 873d735c5df83..a68de3a0f106b 100644 --- a/paddle/phi/api/yaml/api_compat.yaml +++ b/paddle/phi/api/yaml/api_compat.yaml @@ -1,12 +1,39 @@ - api : atan2 inputs : - x : X1 - y : X2 + {x : X1, y : X2} outputs : out : Out +- api : bernoulli + inputs : + x : X + outputs : + out : Out + +- api : cholesky + inputs : + x : X + outputs : + out : Out + +- api : cholesky_solve + inputs : + {x : X, y : Y} + outputs : + out : Out 
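+
+# Note: the inputs/outputs blocks in this file map the argument names used in
+# the api yaml (x, y, out, ...) to the parameter names of the legacy operator
+# (X, Y, Out, ...); now that the op generator no longer converts names to
+# PascalCase automatically, these mappings have to be spelled out explicitly.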
+ +- api : conv2d + extra : + attrs : [bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", bool fuse_alpha = false, bool fuse_beta = false, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = 512, bool exhaustive_search = false] + - api : cross - inputs : {x : X, y : Y} + inputs : + {x : X, y : Y} attrs : axis : dim outputs : @@ -26,17 +53,50 @@ outputs : out : Out +- api : digamma + inputs : + x : X + outputs : + out : Out + +- api : dist + inputs : + {x : X, y : Y} + outputs : + out : Out + +- api : dot + inputs : + {x : X, y : Y} + outputs : + out : Out + +- api : erf + inputs : + x : X + outputs : + out : Out + +- api : mv + inputs : + {x : X, vec : Vec} + outputs : + out : Out + +- api : poisson + inputs : + x : X + outputs : + out : Out + - api : trace inputs : x : Input outputs : out : Out -- api : conv2d - extra : - attrs : [bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, - bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", bool fuse_alpha = false, bool fuse_beta = false, bool use_addto = false, - bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, - float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, - int workspace_size_MB = 512, bool exhaustive_search = false] +- api : trunc + inputs : + x : X + outputs : + out : Out diff --git a/paddle/phi/api/yaml/generator/filters.py b/paddle/phi/api/yaml/generator/filters.py index d978293fe6f73..cda858ab6e74e 100644 --- a/paddle/phi/api/yaml/generator/filters.py +++ b/paddle/phi/api/yaml/generator/filters.py @@ -79,9 +79,9 @@ def to_sr_output_type(s): # -------------- transform argument names from yaml to opmaker ------------ def to_opmaker_name(s): if s.endswith("_grad"): - return 'GradVarName("{}")'.format(to_pascal_case(s[:-5])) + return 'GradVarName("{}")'.format(s[:-5]) else: - return '"{}"'.format(to_pascal_case(s)) + return '"{}"'.format(s) def to_opmaker_name_cstr(s): diff --git a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 index 841de704579d3..0e684664c4884 100644 --- a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 @@ -358,15 +358,15 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker input_orig_names, output_orig_names) %}{# inline #} {% if name in input_names %} {% set name_in_forward_orig = input_orig_names[input_names.index(name)]%} -Input("{{name_in_forward_orig | to_pascal_case}}") +Input("{{name_in_forward_orig}}") {%- elif name in output_names %} {% set name_in_forward_orig = output_orig_names[output_names.index(name)]%} -Output("{{name | to_pascal_case}}") +Output("{{name}}") {%- elif name.endswith("_grad") %}{# output grad#} {% set name_in_forward = name[:-5] %} {% if name_in_forward in output_names %} {% set name_in_forward_orig = output_orig_names[output_names.index(name_in_forward)] %} -OutputGrad("{{name_in_forward_orig | to_pascal_case}}") +OutputGrad("{{name_in_forward_orig}}") {%- endif %} {%- endif %} {%- 
endmacro %} @@ -376,11 +376,11 @@ OutputGrad("{{name_in_forward_orig | to_pascal_case}}") {% if name[:-5] in input_names %} {% set name_in_forward = name[:-5] %} {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} -InputGrad("{{name[:-5] | to_pascal_case}}") +InputGrad("{{name[:-5]}}") {%- elif (name | to_input_name) in input_names %} {% set name_in_forward = name | to_input_name %} {% set name_in_forward_orig = input_orig_names[input_names.index(name_in_forward)]%} -InputGrad("{{name | to_input_name | to_pascal_case}}") +InputGrad("{{name | to_input_name}}") {%- endif %} {%- endmacro %} From 874438315e2934bc20cd4fb169f2d79057097963 Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Fri, 15 Jul 2022 18:16:54 +0800 Subject: [PATCH 224/250] Standard sparse conv name (#44353) --- paddle/phi/api/yaml/sparse_api.yaml | 2 +- paddle/phi/api/yaml/sparse_bw_api.yaml | 2 +- ...ution_grad_kernel.h => conv_grad_kernel.h} | 51 ++++++------ paddle/phi/kernels/sparse/conv_kernel.h | 62 ++++++++++++++ .../phi/kernels/sparse/convolution_kernel.h | 62 -------------- ...ion_grad_kernel.cc => conv_grad_kernel.cc} | 80 +++++++++---------- .../{convolution_kernel.cc => conv_kernel.cc} | 64 +++++++-------- paddle/phi/kernels/sparse/cpu/convolution.h | 2 +- ...ion_grad_kernel.cu => conv_grad_kernel.cu} | 80 +++++++++---------- .../{convolution_kernel.cu => conv_kernel.cu} | 68 ++++++++-------- .../phi/kernels/sparse/gpu/convolution.cu.h | 2 +- paddle/phi/tests/api/test_sparse_conv_api.cc | 2 +- .../kernels/test_sparse_conv3d_dev_api.cc | 80 +++++++++---------- 13 files changed, 278 insertions(+), 279 deletions(-) rename paddle/phi/kernels/sparse/{convolution_grad_kernel.h => conv_grad_kernel.h} (53%) create mode 100644 paddle/phi/kernels/sparse/conv_kernel.h delete mode 100644 paddle/phi/kernels/sparse/convolution_kernel.h rename paddle/phi/kernels/sparse/cpu/{convolution_grad_kernel.cc => conv_grad_kernel.cc} (80%) rename paddle/phi/kernels/sparse/cpu/{convolution_kernel.cc => conv_kernel.cc} (83%) rename paddle/phi/kernels/sparse/gpu/{convolution_grad_kernel.cu => conv_grad_kernel.cu} (84%) rename paddle/phi/kernels/sparse/gpu/{convolution_kernel.cu => conv_kernel.cu} (86%) diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index 4c513ed7d2edd..e32ce5b21540b 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -84,7 +84,7 @@ args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) output : Tensor(out), Tensor(rulebook) kernel : - func : sparse_conv3d{sparse_coo, dense -> sparse_coo, dense} + func : conv3d_coo{sparse_coo, dense -> sparse_coo, dense} layout : x intermediate : rulebook backward : conv3d_grad diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 220d45cadcb06..6e3a82a22bcfc 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -76,7 +76,7 @@ args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm) output : Tensor(x_grad), Tensor(kernel_grad) kernel : - func : sparse_conv3d_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} + func : conv3d_coo_grad{sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} - backward_api : coo_to_dense_grad forward : coo_to_dense(Tensor x) -> Tensor(out) diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h 
b/paddle/phi/kernels/sparse/conv_grad_kernel.h similarity index 53% rename from paddle/phi/kernels/sparse/convolution_grad_kernel.h rename to paddle/phi/kernels/sparse/conv_grad_kernel.h index eebfcddfc7a9e..205823e620375 100644 --- a/paddle/phi/kernels/sparse/convolution_grad_kernel.h +++ b/paddle/phi/kernels/sparse/conv_grad_kernel.h @@ -17,27 +17,26 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad); +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad); template -std::tuple Conv3dGrad( +std::tuple Conv3dCooGrad( const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& kernel, @@ -52,18 +51,18 @@ std::tuple Conv3dGrad( DenseTensor kernel_grad; // TODO(zhangkaihuo): call InferMeta func here - Conv3dGradKernel(dev_ctx, - x, - kernel, - rulebook, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - &x_grad, - &kernel_grad); + Conv3dCooGradKernel(dev_ctx, + x, + kernel, + rulebook, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + &x_grad, + &kernel_grad); return std::make_tuple(x_grad, kernel_grad); } diff --git a/paddle/phi/kernels/sparse/conv_kernel.h b/paddle/phi/kernels/sparse/conv_kernel.h new file mode 100644 index 0000000000000..fbff46d4390ba --- /dev/null +++ b/paddle/phi/kernels/sparse/conv_kernel.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/sparse/convolution.h" + +namespace phi { +namespace sparse { + +template +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook); + +template +SparseCooTensor Conv3dCoo(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + DenseTensor* rulebook) { + SparseCooTensor coo; + Conv3dCooKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + &coo, + rulebook); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/convolution_kernel.h b/paddle/phi/kernels/sparse/convolution_kernel.h deleted file mode 100644 index 62a72a9dd4115..0000000000000 --- a/paddle/phi/kernels/sparse/convolution_kernel.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/sparse/convolution.h" - -namespace phi { -namespace sparse { - -template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook); - -template -SparseCooTensor Conv3d(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - DenseTensor* rulebook) { - SparseCooTensor coo; - Conv3dKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - &coo, - rulebook); - return coo; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc similarity index 80% rename from paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc rename to paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc index a675853ac47c1..a8f4441eae897 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_grad_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -31,18 +31,18 @@ namespace sparse { // x_grad = out_grad * transpose(kenrel) // kernel_grad = transpose(x) * out_grad template -void Conv3dGradCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; @@ -178,42 +178,42 @@ void Conv3dGradCPUKernel(const CPUContext& dev_ctx, } template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGradCPUKernel", ([&] { - Conv3dGradCPUKernel(dev_ctx, - x, - kernel, - rulebook, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - x_grad, - kernel_grad); + x.non_zero_indices().dtype(), "Conv3dCooGradCPUKernel", ([&] { + Conv3dCooGradCPUKernel(dev_ctx, + x, + kernel, + rulebook, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + x_grad, + kernel_grad); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d_grad, +PD_REGISTER_KERNEL(conv3d_coo_grad, CPU, ALL_LAYOUT, - phi::sparse::Conv3dGradKernel, + phi::sparse::Conv3dCooGradKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); diff --git a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc similarity index 83% rename from paddle/phi/kernels/sparse/cpu/convolution_kernel.cc rename to paddle/phi/kernels/sparse/cpu/conv_kernel.cc index 1b95de890deeb..7147a29a9c832 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/conv_kernel.cc @@ -27,16 +27,16 @@ namespace sparse { * out: (N, D, H, W, OC) **/ template -void Conv3dCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { 
+void Conv3dCooCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -151,28 +151,28 @@ void Conv3dCPUKernel(const CPUContext& dev_ctx, } template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] { - Conv3dCPUKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - out, - rulebook); + x.non_zero_indices().dtype(), "Conv3dCooCPUKernel", ([&] { + Conv3dCooCPUKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + out, + rulebook); })); } @@ -180,6 +180,6 @@ void Conv3dKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - sparse_conv3d, CPU, ALL_LAYOUT, phi::sparse::Conv3dKernel, float, double) { + conv3d_coo, CPU, ALL_LAYOUT, phi::sparse::Conv3dCooKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index b2544619774c2..373087ade272b 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu similarity index 84% rename from paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu rename to paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu index 1f82f2ff93e96..0ce3558e1d73f 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -39,18 +39,18 @@ namespace sparse { // x_grad = out_grad * transpose(kenrel) // kernel_grad = transpose(x) * out_grad template -void Conv3dGradGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { const auto& kernel_dims = kernel.dims(); const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int in_channels = kernel_dims[3]; @@ -220,42 +220,42 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, } template -void Conv3dGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const DenseTensor& rulebook, - const SparseCooTensor& out_grad, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* x_grad, - DenseTensor* kernel_grad) { +void Conv3dCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const DenseTensor& rulebook, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* x_grad, + DenseTensor* kernel_grad) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGradGPUKernel", ([&] { - Conv3dGradGPUKernel(dev_ctx, - x, - kernel, - rulebook, - out_grad, - paddings, - dilations, - strides, - groups, - subm, - x_grad, - kernel_grad); + x.non_zero_indices().dtype(), "Conv3dCooGradGPUKernel", ([&] { + Conv3dCooGradGPUKernel(dev_ctx, + x, + kernel, + rulebook, + out_grad, + paddings, + dilations, + strides, + groups, + subm, + x_grad, + kernel_grad); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d_grad, +PD_REGISTER_KERNEL(conv3d_coo_grad, GPU, ALL_LAYOUT, - phi::sparse::Conv3dGradKernel, + phi::sparse::Conv3dCooGradKernel, float, double, phi::dtype::float16) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu similarity index 86% rename from paddle/phi/kernels/sparse/gpu/convolution_kernel.cu rename to paddle/phi/kernels/sparse/gpu/conv_kernel.cu index fe66fb5cff9de..6820b677147f3 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" @@ -27,16 +27,16 @@ namespace phi { namespace sparse { template -void Conv3dGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { +void Conv3dCooGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook) { // update padding and dilation // Currently, only support x.layout is NDHWC, groups = 1 // if x.layout != NDHWC then transpose(x), transpose(weight) @@ -190,38 +190,38 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, * out: (N, D, H, W, OC) **/ template -void Conv3dKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const DenseTensor& kernel, - const std::vector& paddings, - const std::vector& dilations, - const std::vector& strides, - const int groups, - const bool subm, - SparseCooTensor* out, - DenseTensor* rulebook) { +void Conv3dCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + SparseCooTensor* out, + DenseTensor* rulebook) { PD_VISIT_INTEGRAL_TYPES( - x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] { - Conv3dGPUKernel(dev_ctx, - x, - kernel, - paddings, - dilations, - strides, - groups, - subm, - out, - rulebook); + x.non_zero_indices().dtype(), "Conv3dCooGPUKernel", ([&] { + Conv3dCooGPUKernel(dev_ctx, + x, + kernel, + paddings, + dilations, + strides, + groups, + subm, + out, + rulebook); })); } } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(sparse_conv3d, +PD_REGISTER_KERNEL(conv3d_coo, GPU, ALL_LAYOUT, - phi::sparse::Conv3dKernel, + phi::sparse::Conv3dCooKernel, float, double, phi::dtype::float16) { diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index d56575cddbfe2..2591d24bfe443 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc index bbdb2f70d7fd3..95f4afe4d1540 100644 --- a/paddle/phi/tests/api/test_sparse_conv_api.cc +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/sparse_coo_tensor.h" -PD_DECLARE_KERNEL(sparse_conv3d, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv3d_coo, CPU, ALL_LAYOUT); template void TestConv3dBase(const std::vector& indices, diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 2efdd47998073..4a39f2bd8f1c4 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -23,8 +23,8 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/sparse/coalesce_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" -#include "paddle/phi/kernels/sparse/convolution_kernel.h" +#include "paddle/phi/kernels/sparse/conv_grad_kernel.h" +#include "paddle/phi/kernels/sparse/conv_kernel.h" namespace phi { namespace tests { @@ -114,15 +114,15 @@ void TestConv3dBase(const std::vector& indices, if (!std::is_same::value) { DenseTensor rulebook = phi::Empty( dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); - SparseCooTensor out = sparse::Conv3d(dev_ctx_cpu, - x_tensor, - kernel_tensor, - paddings, - dilations, - strides, - 1, - subm, - &rulebook); + SparseCooTensor out = sparse::Conv3dCoo(dev_ctx_cpu, + x_tensor, + kernel_tensor, + paddings, + dilations, + strides, + 1, + subm, + &rulebook); ASSERT_EQ(correct_out_dims.size(), out.dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { @@ -139,16 +139,16 @@ void TestConv3dBase(const std::vector& indices, if (backward) { std::tuple grads = - sparse::Conv3dGrad(dev_ctx_cpu, - x_tensor, - kernel_tensor, - rulebook, - out, - paddings, - dilations, - strides, - 1, - subm); + sparse::Conv3dCooGrad(dev_ctx_cpu, + x_tensor, + kernel_tensor, + rulebook, + out, + paddings, + dilations, + strides, + 1, + subm); f_verify(std::get<0>(grads).non_zero_elements().data(), features_grad); f_verify(std::get<1>(grads).data(), kernel_grad); } @@ -198,15 +198,15 @@ void TestConv3dBase(const std::vector& indices, DenseTensor d_rulebook = phi::Empty( dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW)); - SparseCooTensor d_out = sparse::Conv3d(dev_ctx_gpu, - d_x_tensor, - d_kernel_tensor, - paddings, - dilations, - strides, - 1, - subm, - &d_rulebook); + SparseCooTensor d_out = sparse::Conv3dCoo(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + paddings, + dilations, + strides, + 1, + subm, + &d_rulebook); SparseCooTensor tmp_d_out = sparse::Coalesce(dev_ctx_gpu, d_out); @@ -242,16 +242,16 @@ void TestConv3dBase(const std::vector& indices, if (backward) { std::tuple grads = - sparse::Conv3dGrad(dev_ctx_gpu, - d_x_tensor, - d_kernel_tensor, - d_rulebook, - d_out, - paddings, - dilations, - strides, - 1, - subm); + sparse::Conv3dCooGrad(dev_ctx_gpu, + d_x_tensor, + d_kernel_tensor, + d_rulebook, + d_out, + paddings, + dilations, + strides, + 1, + subm); DenseTensor d_features_grad = std::get<0>(grads).non_zero_elements(); DenseTensor d_kernel_grad = std::get<1>(grads); DenseTensor h_features_grad = From 13d01e6e27e8673967b54fa32317f2bc5528efac Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Fri, 15 Jul 2022 18:30:00 +0800 Subject: [PATCH 225/250] [Eager] eager variable back sync (#44343) * eager variable back sync --- paddle/fluid/eager/eager_tensor.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/paddle/fluid/eager/eager_tensor.h 
b/paddle/fluid/eager/eager_tensor.h index d61a55b6dea88..8026b8e368478 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -209,6 +209,7 @@ class EagerVariable final { if (tensor.defined()) { if (tensor.is_dense_tensor()) { ConstructVariableFromTensor(tensor); + src_tensor_ = tensor.impl(); } else if (tensor.is_selected_rows()) { ConstructVariableFromTensor(tensor); } else if (IsVariableCompatTensor(tensor) && @@ -229,6 +230,19 @@ class EagerVariable final { } } + ~EagerVariable() { + if (src_tensor_) { + auto* framework_tensor = var_.GetMutable(); + auto tensor_dense = static_cast(src_tensor_.get()); + if (framework_tensor->memory_size() > 0 && + (!paddle::platform::is_same_place(framework_tensor->place(), + tensor_dense->place()) || + framework_tensor->dtype() != tensor_dense->dtype())) { + tensor_dense->ShareBufferWith(*framework_tensor); + } + } + } + /** Part 11: Construct paddle::framework::Variable with phi::Tensor **/ std::shared_ptr GetTensorBase() { // Construct allocation only once. @@ -304,5 +318,6 @@ class EagerVariable final { private: std::string name_{""}; paddle::framework::Variable var_; + std::shared_ptr src_tensor_; }; } // namespace egr From 068f48d86456dd4ee8e6ca88f04496bcebbe94be Mon Sep 17 00:00:00 2001 From: xiongkun Date: Fri, 15 Jul 2022 19:11:26 +0800 Subject: [PATCH 226/250] [ Phi Kernel ] Transfer as_real to phi. (#44263) * transfer as_real to phi * fix erros * blocking: True -> False --- paddle/fluid/operators/complex_view_op.cc | 21 ++++------- paddle/fluid/operators/complex_view_op.cu | 5 --- paddle/fluid/operators/complex_view_op.h | 15 -------- paddle/phi/api/yaml/legacy_api.yaml | 9 +++++ paddle/phi/infermeta/unary.cc | 8 +++++ paddle/phi/infermeta/unary.h | 2 ++ paddle/phi/kernels/as_real_kernel.h | 26 ++++++++++++++ paddle/phi/kernels/cpu/as_real_kernel.cc | 22 ++++++++++++ paddle/phi/kernels/gpu/as_real_kernel.cu | 22 ++++++++++++ paddle/phi/kernels/impl/as_real_impl.h | 35 +++++++++++++++++++ .../tests/unittests/test_complex_view_op.py | 1 + 11 files changed, 131 insertions(+), 35 deletions(-) create mode 100644 paddle/phi/kernels/as_real_kernel.h create mode 100644 paddle/phi/kernels/cpu/as_real_kernel.cc create mode 100644 paddle/phi/kernels/gpu/as_real_kernel.cu create mode 100644 paddle/phi/kernels/impl/as_real_impl.h diff --git a/paddle/fluid/operators/complex_view_op.cc b/paddle/fluid/operators/complex_view_op.cc index 6bdd2b48c4503..ce46a0f0121e6 100644 --- a/paddle/fluid/operators/complex_view_op.cc +++ b/paddle/fluid/operators/complex_view_op.cc @@ -20,7 +20,9 @@ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -94,17 +96,6 @@ class AsRealOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "as_real"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "as_real"); - - auto out_dims_v = phi::vectorize(ctx->GetInputDim("X")); - out_dims_v.push_back(2); - const framework::DDim out_dims = phi::make_ddim(out_dims_v); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -148,6 +139,9 @@ class 
AsRealGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(as_real, + AsRealInferShapeFunctor, + PD_INFER_META(phi::AsRealInferMeta)); REGISTER_OPERATOR(as_complex, ops::AsComplexOp, @@ -158,13 +152,10 @@ REGISTER_OPERATOR(as_complex, REGISTER_OPERATOR(as_real, ops::AsRealOp, ops::AsRealOpMaker, + AsRealInferShapeFunctor, ops::AsRealGradMaker, ops::AsRealGradMaker); REGISTER_OP_CPU_KERNEL(as_complex, ops::AsComplexKernel, ops::AsComplexKernel); - -REGISTER_OP_CPU_KERNEL(as_real, - ops::AsRealKernel, - ops::AsRealKernel); diff --git a/paddle/fluid/operators/complex_view_op.cu b/paddle/fluid/operators/complex_view_op.cu index 18d448fb75d3c..eb10781491346 100644 --- a/paddle/fluid/operators/complex_view_op.cu +++ b/paddle/fluid/operators/complex_view_op.cu @@ -22,8 +22,3 @@ REGISTER_OP_CUDA_KERNEL( as_complex, ops::AsComplexKernel, ops::AsComplexKernel); - -REGISTER_OP_CUDA_KERNEL( - as_real, - ops::AsRealKernel, - ops::AsRealKernel); diff --git a/paddle/fluid/operators/complex_view_op.h b/paddle/fluid/operators/complex_view_op.h index 51abaa88f856e..169b8b05a554e 100644 --- a/paddle/fluid/operators/complex_view_op.h +++ b/paddle/fluid/operators/complex_view_op.h @@ -41,20 +41,5 @@ class AsComplexKernel : public framework::OpKernel { } }; -template -class AsRealKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - auto* out = context.Output("Out"); - - out->mutable_data(context.GetPlace()); - const framework::DDim out_dims_original = out->dims(); - framework::TensorCopy(*x, context.GetPlace(), out); - out->Resize(out_dims_original); // restored the shape - out->mutable_data(context.GetPlace()); // restore the dtype - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index ab82ce9473e3c..b9e7361abea7d 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -167,6 +167,15 @@ func : argsort backward : argsort_grad +- api : as_real + args : (Tensor x) + output : Tensor + infer_meta : + func : AsRealInferMeta + kernel : + func : as_real +# backward : as_complex + # asin - api : asin args : (Tensor x) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index f6e3b0d72474a..02f812f9b17c0 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -148,6 +148,14 @@ void ArgsortInferMeta(const MetaTensor& input, indices->share_lod(input); } +void AsRealInferMeta(const MetaTensor& input, MetaTensor* output) { + auto out_dims_v = phi::vectorize(input.dims()); + out_dims_v.push_back(2); + auto out_dims = phi::make_ddim(out_dims_v); + output->set_dims(out_dims); + output->share_lod(input); +} + void BatchSizeLikeInferMeta(const MetaTensor& x, const std::vector& shape, int x_batch_size_dim, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index fc36e1d4f85b6..30db8dcae9882 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -48,6 +48,8 @@ void ArgsortInferMeta(const MetaTensor& input, MetaTensor* output, MetaTensor* indices); +void AsRealInferMeta(const MetaTensor& input, MetaTensor* output); + void BatchSizeLikeInferMeta(const MetaTensor& x, const std::vector& shape, int x_batch_size_dim, diff --git a/paddle/phi/kernels/as_real_kernel.h b/paddle/phi/kernels/as_real_kernel.h new file mode 
100644 index 0000000000000..e600f3ce39a6d --- /dev/null +++ b/paddle/phi/kernels/as_real_kernel.h @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AsRealKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/as_real_kernel.cc b/paddle/phi/kernels/cpu/as_real_kernel.cc new file mode 100644 index 0000000000000..c4f6ec87af414 --- /dev/null +++ b/paddle/phi/kernels/cpu/as_real_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/as_real_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/as_real_impl.h" + +PD_REGISTER_KERNEL(as_real, CPU, ALL_LAYOUT, phi::AsRealKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/as_real_kernel.cu b/paddle/phi/kernels/gpu/as_real_kernel.cu new file mode 100644 index 0000000000000..63227e7f0b1d8 --- /dev/null +++ b/paddle/phi/kernels/gpu/as_real_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/as_real_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/as_real_impl.h" + +PD_REGISTER_KERNEL(as_real, GPU, ALL_LAYOUT, phi::AsRealKernel, float, double) { +} diff --git a/paddle/phi/kernels/impl/as_real_impl.h b/paddle/phi/kernels/impl/as_real_impl.h new file mode 100644 index 0000000000000..0534b836e3732 --- /dev/null +++ b/paddle/phi/kernels/impl/as_real_impl.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/as_real_kernel.h" + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +template +void AsRealKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc(out); + auto out_dims_original = out->dims(); + Copy(ctx, x, ctx.GetPlace(), false, out); + out->Resize(out_dims_original); // restored the shape. + out->set_type( + paddle::experimental::CppTypeToDataType::Type()); // restored the + // dtype. +} + +} // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_complex_view_op.py b/python/paddle/fluid/tests/unittests/test_complex_view_op.py index 6b224209edcc5..a2fd77bcabf9e 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_view_op.py +++ b/python/paddle/fluid/tests/unittests/test_complex_view_op.py @@ -67,6 +67,7 @@ def setUp(self): out_ref = ref_view_as_real(x) self.inputs = {'X': x} self.outputs = {'Out': out_ref} + self.python_api = paddle.as_real self.out_grad = np.ones([10, 10, 2], dtype="float64") def test_check_output(self): From 4a4a0369a9261ac6ce93db7fec098d1e6be4e196 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 15 Jul 2022 21:34:42 +0800 Subject: [PATCH 227/250] [Eager]Fix assert statement (#43492) --- python/paddle/tensor/manipulation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 8d2bfa2a2cb64..6c4b1cd22b0ef 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2853,8 +2853,7 @@ def tile(x, repeat_times, name=None): """ if in_dygraph_mode(): if isinstance(repeat_times, core.eager.Tensor): - assert (repeat_times.ndim == 1, - "Only support ndim == 1 while repeat_times is a Tensor.") + assert repeat_times.ndim == 1, "Only support ndim == 1 while repeat_times is a Tensor." 
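# NOTE (illustrative, not from the original patch): the parenthesized form
# asserts a two-element tuple, and a non-empty tuple is always truthy, so the
# old check could never fire. A minimal sketch of the difference, assuming a
# failing condition:
#
#     assert (1 == 2, "message")   # passes silently: asserts a tuple
#     assert 1 == 2, "message"     # raises AssertionError, as intended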
repeat_times = repeat_times.numpy().tolist() return _C_ops.final_state_tile(x, repeat_times) From 6f7550e473c609fae780e84f321dd4dd87e2bbed Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 15 Jul 2022 11:13:45 -0500 Subject: [PATCH 228/250] Not rename pb file to avoid re-compile (#44370) --- paddle/fluid/distributed/CMakeLists.txt | 5 ----- paddle/fluid/distributed/common/afs_warpper.h | 2 +- paddle/fluid/distributed/ps/service/ps_client.h | 2 +- .../distributed/ps/service/ps_service/graph_py_service.h | 2 +- paddle/fluid/distributed/ps/service/ps_service/service.h | 2 +- paddle/fluid/distributed/ps/service/server.h | 2 +- paddle/fluid/distributed/ps/table/accessor.h | 2 +- paddle/fluid/distributed/ps/table/ctr_accessor.h | 2 +- paddle/fluid/distributed/ps/table/ctr_double_accessor.h | 2 +- paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h | 2 +- paddle/fluid/distributed/ps/table/sparse_accessor.h | 2 +- paddle/fluid/distributed/ps/table/sparse_sgd_rule.h | 2 +- paddle/fluid/distributed/ps/table/tensor_accessor.h | 2 +- paddle/fluid/distributed/test/barrier_table_test.cc | 2 +- .../fluid/distributed/test/brpc_service_sparse_sgd_test.cc | 2 +- paddle/fluid/distributed/test/ctr_accessor_test.cc | 2 +- paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc | 2 +- paddle/fluid/distributed/test/dense_table_test.cc | 2 +- paddle/fluid/distributed/test/graph_node_split_test.cc | 2 +- paddle/fluid/distributed/test/graph_node_test.cc | 2 +- paddle/fluid/distributed/test/graph_table_sample_test.cc | 2 +- paddle/fluid/distributed/test/memory_geo_table_test.cc | 2 +- paddle/fluid/distributed/test/memory_sparse_table_test.cc | 2 +- paddle/fluid/distributed/test/sparse_sgd_rule_test.cc | 2 +- paddle/fluid/distributed/test/table_test.cc | 2 +- paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu | 2 +- paddle/fluid/framework/trainer.h | 2 +- 27 files changed, 26 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 0b5f608122683..24e0a8c7a5d9f 100755 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -33,11 +33,6 @@ if(NOT WITH_PSCORE) endif() proto_library(ps_framework_proto SRCS the_one_ps.proto) -add_custom_command( - TARGET ps_framework_proto - POST_BUILD - COMMAND mv the_one_ps.pb.h ps.pb.h - COMMAND mv the_one_ps.pb.cc ps.pb.cc) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-error=unused-value -Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result" diff --git a/paddle/fluid/distributed/common/afs_warpper.h b/paddle/fluid/distributed/common/afs_warpper.h index f9e8118514670..542d65d7a649f 100644 --- a/paddle/fluid/distributed/common/afs_warpper.h +++ b/paddle/fluid/distributed/common/afs_warpper.h @@ -20,7 +20,7 @@ #include #include -#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 1e680345b7b49..01bf29b429191 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -22,11 +22,11 @@ #include #include "paddle/fluid/distributed/common/cost_timer.h" -#include 
"paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/platform/timer.h" namespace paddle { diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h index b09ee358af010..4e915ab50fe86 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h @@ -27,12 +27,12 @@ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_client.h" #include "paddle/fluid/distributed/ps/service/graph_brpc_server.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/ps/service/ps_service/service.h b/paddle/fluid/distributed/ps/service/ps_service/service.h index 69e40da54f44f..1b9f9249c3bbe 100644 --- a/paddle/fluid/distributed/ps/service/ps_service/service.h +++ b/paddle/fluid/distributed/ps/service/ps_service/service.h @@ -19,10 +19,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/ps_client.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/service/server.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/service/server.h b/paddle/fluid/distributed/ps/service/server.h index cd4e39ae450d1..32c989826811f 100644 --- a/paddle/fluid/distributed/ps/service/server.h +++ b/paddle/fluid/distributed/ps/service/server.h @@ -24,9 +24,9 @@ #include "butil/endpoint.h" #include "google/protobuf/service.h" #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/distributed/ps/table/accessor.h b/paddle/fluid/distributed/ps/table/accessor.h index 3261fb9f2ea01..b55c77bf52d84 100644 --- a/paddle/fluid/distributed/ps/table/accessor.h +++ b/paddle/fluid/distributed/ps/table/accessor.h @@ -21,7 +21,7 @@ #include "paddle/fluid/distributed/common/afs_warpper.h" #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_accessor.h b/paddle/fluid/distributed/ps/table/ctr_accessor.h index 46d991ef1d787..c9283d478feb4 100644 --- a/paddle/fluid/distributed/ps/table/ctr_accessor.h +++ 
b/paddle/fluid/distributed/ps/table/ctr_accessor.h @@ -19,9 +19,9 @@ #include #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h index e766f4c767c43..4b69054e555c5 100644 --- a/paddle/fluid/distributed/ps/table/ctr_double_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_double_accessor.h @@ -19,9 +19,9 @@ #include #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h index 38b3e6ecae68d..a360030cb7d3d 100644 --- a/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h +++ b/paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h @@ -19,9 +19,9 @@ #include #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/sparse_accessor.h b/paddle/fluid/distributed/ps/table/sparse_accessor.h index d4fbddc934862..9a58476d8e373 100644 --- a/paddle/fluid/distributed/ps/table/sparse_accessor.h +++ b/paddle/fluid/distributed/ps/table/sparse_accessor.h @@ -19,9 +19,9 @@ #include #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h index 215a15a7d31eb..f62cffdf232e7 100644 --- a/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h +++ b/paddle/fluid/distributed/ps/table/sparse_sgd_rule.h @@ -21,7 +21,7 @@ #include "glog/logging.h" // for CHECK #include "paddle/fluid/distributed/common/local_random.h" // for local_uniform_real_distribution #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/tensor_accessor.h b/paddle/fluid/distributed/ps/table/tensor_accessor.h index 3abf8156d9331..8401746a1e887 100644 --- a/paddle/fluid/distributed/ps/table/tensor_accessor.h +++ b/paddle/fluid/distributed/ps/table/tensor_accessor.h @@ -20,8 +20,8 @@ #include #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/accessor.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/barrier_table_test.cc 
b/paddle/fluid/distributed/test/barrier_table_test.cc index 12c389e9766b5..31f0f0844345c 100644 --- a/paddle/fluid/distributed/test/barrier_table_test.cc +++ b/paddle/fluid/distributed/test/barrier_table_test.cc @@ -18,9 +18,9 @@ limitations under the License. */ #include #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/common_table.h" #include "paddle/fluid/distributed/ps/table/table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index bed37e6036a5c..d10a34ddfe324 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -18,10 +18,10 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/distributed/ps/service/env.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc index 3f2ac69bd9a74..9b71e4524625c 100644 --- a/paddle/fluid/distributed/test/ctr_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc index fbf179dbeeef0..39bff554ff9d2 100644 --- a/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc +++ b/paddle/fluid/distributed/test/ctr_dymf_accessor_test.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/common/registerer.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/dense_table_test.cc b/paddle/fluid/distributed/test/dense_table_test.cc index 185d9d3aed1d4..8b021e2c9624e 100644 --- a/paddle/fluid/distributed/test/dense_table_test.cc +++ b/paddle/fluid/distributed/test/dense_table_test.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/memory_dense_table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index dd085d7510b60..96769cff83bb8 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -21,7 +21,6 @@ limitations under the License. 
*/ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/distributed/ps/service/env.h" @@ -32,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 3439ffe8a0c25..de12b715deb54 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h" #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h" #include "paddle/fluid/distributed/ps/service/env.h" @@ -33,6 +32,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h" #include "paddle/fluid/distributed/ps/service/ps_service/service.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/distributed/test/graph_table_sample_test.cc b/paddle/fluid/distributed/test/graph_table_sample_test.cc index 5c05a3a70f49f..15c86e2fdd378 100644 --- a/paddle/fluid/distributed/test/graph_table_sample_test.cc +++ b/paddle/fluid/distributed/test/graph_table_sample_test.cc @@ -25,8 +25,8 @@ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace operators = paddle::operators; diff --git a/paddle/fluid/distributed/test/memory_geo_table_test.cc b/paddle/fluid/distributed/test/memory_geo_table_test.cc index 507211e69fa0f..f01c40f7043b4 100644 --- a/paddle/fluid/distributed/test/memory_geo_table_test.cc +++ b/paddle/fluid/distributed/test/memory_geo_table_test.cc @@ -20,10 +20,10 @@ limitations under the License. */ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/ps/table/memory_sparse_geo_table.h" #include "paddle/fluid/distributed/ps/table/table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/memory_sparse_table_test.cc b/paddle/fluid/distributed/test/memory_sparse_table_test.cc index 485d81a7d6856..da311a7691fc8 100644 --- a/paddle/fluid/distributed/test/memory_sparse_table_test.cc +++ b/paddle/fluid/distributed/test/memory_sparse_table_test.cc @@ -22,8 +22,8 @@ limitations under the License. 
*/ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc index 2dfc2961f39d1..e12e2757504a5 100644 --- a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/test/table_test.cc b/paddle/fluid/distributed/test/table_test.cc index 56809abad0c7c..afeaf273174f4 100644 --- a/paddle/fluid/distributed/test/table_test.cc +++ b/paddle/fluid/distributed/test/table_test.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/table/memory_dense_table.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" //#include "paddle/fluid/distributed/ps/table/sparse_geo_table.h" namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu index 03ef905b9ab48..5fc0625992c79 100644 --- a/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu +++ b/paddle/fluid/framework/fleet/heter_ps/test_sample_rate.cu @@ -25,11 +25,11 @@ #include "google/protobuf/text_format.h" #include "gtest/gtest.h" -#include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/ps/service/env.h" #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h" #include "paddle/fluid/distributed/ps/table/common_graph_table.h" #include "paddle/fluid/distributed/ps/table/graph/graph_node.h" +#include "paddle/fluid/distributed/the_one_ps.pb.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" #include "paddle/fluid/framework/fleet/heter_ps/graph_sampler.h" diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 1a805ccd76e44..7a60d5db0dfa9 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -37,7 +37,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/port.h" #ifdef PADDLE_WITH_PSLIB -#include "proto/ps.pb.h" +#include "proto/the_one_ps.pb.h" #endif namespace paddle { From c0a7830f0ce6262ff0c9507208400b1111743f0a Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Sat, 16 Jul 2022 15:05:00 +0800 Subject: [PATCH 229/250] [Phi] Migrate solve kernel to phi (#44363) * draft version * draft version * draft version * migrate solve kernel to phi * polish * polish * re useless header file, fix a bug in grad_kernel_impl * add header file in need --- paddle/fluid/operators/solve_op.cc | 11 +- paddle/fluid/operators/solve_op.cu | 25 - paddle/fluid/operators/solve_op.h | 661 ------------------ paddle/phi/infermeta/unary.cc | 19 +- paddle/phi/kernels/CMakeLists.txt | 1 + paddle/phi/kernels/cpu/solve_grad_kernel.cc | 20 + paddle/phi/kernels/cpu/solve_kernel.cc | 19 + paddle/phi/kernels/gpu/solve_grad_kernel.cu | 20 + paddle/phi/kernels/gpu/solve_kernel.cu | 19 + .../phi/kernels/impl/solve_grad_kernel_impl.h | 267 +++++++ paddle/phi/kernels/impl/solve_kernel_impl.h | 199 ++++++ paddle/phi/kernels/solve_grad_kernel.h | 30 + paddle/phi/kernels/solve_kernel.h | 27 + paddle/phi/kernels/unsqueeze_kernel.h | 13 + paddle/phi/ops/compat/solve_sig.cc | 26 + 15 files changed, 654 insertions(+), 703 deletions(-) delete mode 100644 paddle/fluid/operators/solve_op.cu delete mode 100644 paddle/fluid/operators/solve_op.h create mode 100644 paddle/phi/kernels/cpu/solve_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/solve_kernel.cc create mode 100644 paddle/phi/kernels/gpu/solve_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/solve_kernel.cu create mode 100644 paddle/phi/kernels/impl/solve_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/solve_kernel_impl.h create mode 100644 paddle/phi/kernels/solve_grad_kernel.h create mode 100644 paddle/phi/kernels/solve_kernel.h create mode 100644 paddle/phi/ops/compat/solve_sig.cc diff --git a/paddle/fluid/operators/solve_op.cc b/paddle/fluid/operators/solve_op.cc index a7bf413e10519..daa020e4a0d74 100644 --- a/paddle/fluid/operators/solve_op.cc +++ b/paddle/fluid/operators/solve_op.cc @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/solve_op.h" - #include #include #include #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" namespace paddle { @@ -220,10 +220,3 @@ REGISTER_OPERATOR(solve, ops::SolveOpGradMaker); REGISTER_OPERATOR(solve_grad, ops::SolveGradOp); - -REGISTER_OP_CPU_KERNEL(solve, - ops::SolveKernel, - ops::SolveKernel); -REGISTER_OP_CPU_KERNEL(solve_grad, - ops::SolveGradKernel, - ops::SolveGradKernel); diff --git a/paddle/fluid/operators/solve_op.cu b/paddle/fluid/operators/solve_op.cu deleted file mode 100644 index a1e56fab5702b..0000000000000 --- a/paddle/fluid/operators/solve_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/solve_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(solve, - ops::SolveKernel, - ops::SolveKernel); - -REGISTER_OP_CUDA_KERNEL(solve_grad, - ops::SolveGradKernel, - ops::SolveGradKernel); diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h deleted file mode 100644 index 115223749431b..0000000000000 --- a/paddle/fluid/operators/solve_op.h +++ /dev/null @@ -1,661 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" -#include "paddle/fluid/operators/squeeze_op.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/matrix_solve.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using framework::To32BitIndex; - -constexpr int kMULMKLDNNINT8 = 1; - -template -void ReduceSumForSolve(const Tensor* input, - Tensor* output, - const std::vector& reduce_dims, - bool keep_dim, - const paddle::framework::ExecutionContext& ctx) { -#if defined(__NVCC__) || defined(__HIPCC__) - auto stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), - *input, - output, - kps::IdentityFunctor(), - reduce_dims, - stream); -#else - ReduceKernelFunctor( - input, output, reduce_dims, keep_dim, false, ctx) - .template apply(); -#endif -} - -// check the input other is vector_case or not -static inline bool is_vector_rhs(const Tensor& input, const Tensor& other) { - auto x_dim = input.dims(); - auto y_dim = other.dims(); - auto x_dim_size = x_dim.size(); - auto y_dim_size = y_dim.size(); - std::vector x_dims_vec = phi::vectorize(x_dim); - std::vector y_dims_vec = phi::vectorize(y_dim); - - std::vector::const_iterator f = x_dims_vec.begin(); - std::vector::const_iterator l = x_dims_vec.end() - 1; - std::vector x_dims_vec_cut(f, l); // input.shape[:-1] - - std::vector expected_batched_rhs_shape(x_dims_vec_cut); - bool vector_case = - y_dim_size == 1 || (x_dim_size - 1 == y_dim_size && - y_dims_vec == 
(expected_batched_rhs_shape)); - - return vector_case; -} - -// unsqueeze operation helper -static framework::DDim GetOutputShapeUnsqueeze( - const std::vector unsqz_dims, const framework::DDim& in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size = in_dims.size(); - std::vector output_shape(output_size, 0); - - // Validity Check: rank range. - PADDLE_ENFORCE_LE(output_size, - 6, - platform::errors::InvalidArgument( - "The output " - "tensor's rank should be less than 6.")); - - for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_size + 1 : axis; - // Vaildity Check: the axis bound - PADDLE_ENFORCE_GE( - cur, - 0, - platform::errors::InvalidArgument("The insert dimension value should " - "not be less than 0")); - PADDLE_ENFORCE_LE(cur, - cur_output_size, - platform::errors::InvalidArgument( - "The insert dimension value shoule not be larger " - "than the dimension size of input tensor")); - // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { - if (output_shape[i] == 1) { - // Move axis - output_shape[i + 1] = 1; - output_shape[i] = 0; - } - } - output_shape[cur] = 1; - // Add the output size. - cur_output_size++; - } - - // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { - if (output_shape[out_idx] == 0) { - output_shape[out_idx] = in_dims[in_idx++]; - } - } - - return phi::make_ddim(output_shape); -} - -// operation like squeeze(-1) -static void to_squeeze(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out) { - auto x_dims = in.dims(); - std::vector sqz_dims = {-1}; - auto out_dims = GetOutputShape(sqz_dims, x_dims, true); - out->mutable_data(context.GetPlace(), in.type()); - framework::TensorCopy( - in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); -} - -// vector_case, need to operate like unsqueeze(-1) -static void to_unsqueeze(const framework::ExecutionContext& context, - const framework::Tensor& in, - framework::Tensor* out) { - auto x_dims = in.dims(); - std::vector unsqz_dims = {-1}; - framework::DDim out_dims = out->dims(); - out_dims = GetOutputShapeUnsqueeze(unsqz_dims, x_dims); - framework::TensorCopy( - in, - context.GetPlace(), - context.template device_context(), - out); - out->Resize(out_dims); -} - -// Prepared for the broadcast operation -static std::vector get_broadcast_batch_portion( - std::vector x, std::vector y) { - size_t size_x = x.size(); - size_t size_y = y.size(); - size_t size = std::max(size_x, size_y); - std::vector batchPortion(size); - - ptrdiff_t i = (ptrdiff_t)size - 1; - for (; i >= 0; --i) { - ptrdiff_t offset = size - i - 1; - ptrdiff_t dim_x = size_x - offset - 1; - ptrdiff_t dim_y = size_y - offset - 1; - int64_t x_size = (dim_x >= 0) ? x[dim_x] : 1; - int64_t y_size = (dim_y >= 0) ? y[dim_y] : 1; - - PADDLE_ENFORCE_EQ( - (x_size == y_size || x_size == 1 || y_size == 1), - true, - platform::errors::PreconditionNotMet( - "The size of tensor x (%d) must match the size of tensor y " - "(%d) at non-singleton dimension %d.", - x_size, - y_size, - i)); - - batchPortion[i] = x_size != 1 ? x_size : y_size; - } - return batchPortion; -} - -// broadcast the batch dimensions of tensor x and tensor y. 
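// NOTE (illustrative summary, not from the original patch): the helpers being
// removed in this file (to_squeeze/to_unsqueeze, get_broadcast_batch_portion,
// TensorExpand) reappear below in phi as SqueezeKernel/Unsqueeze, a phi copy
// of get_broadcast_batch_portion, and ExpandAsKernel. The vector right-hand-
// side convention is unchanged: if y is 1-D, or y's shape equals x's shape
// with the last dimension dropped, y is unsqueezed to a column before the
// solve and the result is squeezed back. For example (shapes assumed for
// illustration), x of shape [4, 3, 3] with y of shape [4, 3] is solved as
// y [4, 3, 1] and returned as out [4, 3].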
-static inline std::tuple, std::vector> -get_broadcast_dims(const Tensor& x, const Tensor& y) { - std::vector x_dims_vec = phi::vectorize(x.dims()); - std::vector y_dims_vec = phi::vectorize(y.dims()); - - std::vector::const_iterator f1 = x_dims_vec.begin(); - std::vector::const_iterator l1 = x_dims_vec.end() - 2; - std::vector x_dims_vec_cut(f1, l1); - - std::vector::const_iterator f2 = y_dims_vec.begin(); - std::vector::const_iterator l2 = y_dims_vec.end() - 2; - std::vector y_dims_vec_cut(f2, l2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector x_expand_size({expand_batch_portion}); - x_expand_size.insert(x_expand_size.end(), - {x_dims_vec[static_cast(x_dims_vec.size()) - 2], - x_dims_vec[static_cast(x_dims_vec.size()) - 1]}); - - std::vector y_expand_size({expand_batch_portion}); - y_expand_size.insert(y_expand_size.end(), - {y_dims_vec[static_cast(y_dims_vec.size()) - 2], - y_dims_vec[static_cast(y_dims_vec.size()) - 1]}); - - return std::make_tuple(x_expand_size, y_expand_size); -} - -template -void expand_impl(const DeviceContext& context, - const Tensor& in, - Tensor* out, - const std::vector& expand_shape) { - auto vec_in_dims = phi::vectorize(in.dims()); - auto diff = expand_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE( - expand_shape[i], - 0, - platform::errors::InvalidArgument("The expanded size cannot be zero.")); - if (i < diff) { - PADDLE_ENFORCE_GT( - expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand operation.", - expand_shape[i])); - repeat_times[i] = expand_shape[i]; - } else if (expand_shape[i] > 0) { - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - expand_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand operation.", - vec_in_dims[i], - expand_shape[i])); - repeat_times[i] = 1; - } else { - repeat_times[i] = expand_shape[i]; - } - } else { - PADDLE_ENFORCE_EQ( - expand_shape[i], - -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_v2 op, " - "only -1 is supported, but the value received is %d.", - expand_shape[i])); - repeat_times[i] = 1; - } - } - - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out->Resize(out_dims); - out->mutable_data(context.GetPlace()); - auto x = EigenTensor::From(in, new_in_dims); - auto y = EigenTensor::From(*out, out_dims); - auto& place = *context.eigen_device(); - // use 32-bit index to speed up - bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); - if (use_32bit_index) { - EigenBroadcast, T, Rank>::Eval( - place, To32BitIndex(y), To32BitIndex(x), bcast_dims); - } else { - EigenBroadcast, T, Rank>::Eval( - place, y, x, bcast_dims); - } -} - -template -void TensorExpand(const DeviceContext& context, - const Tensor& in, - Tensor* out, - const std::vector& expand_shape) { - // necessary check before expand operation - PADDLE_ENFORCE_GE(expand_shape.size(), - 
in.dims().size(), - platform::errors::InvalidArgument( - "The size of 'expand_shape' (%d) should >= the input " - "Tensor's rank (%d).", - expand_shape.size(), - in.dims().size())); - PADDLE_ENFORCE_LE(expand_shape.size(), - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The size of 'expand_shape' (%d) should be <= %d", - expand_shape.size(), - MAX_RANK_SUPPORTED)); - switch (expand_shape.size()) { - case 1: - expand_impl<1, T, DeviceContext>(context, in, out, expand_shape); - break; - case 2: - expand_impl<2, T, DeviceContext>(context, in, out, expand_shape); - break; - case 3: - expand_impl<3, T, DeviceContext>(context, in, out, expand_shape); - break; - case 4: - expand_impl<4, T, DeviceContext>(context, in, out, expand_shape); - break; - case 5: - expand_impl<5, T, DeviceContext>(context, in, out, expand_shape); - break; - case 6: - expand_impl<6, T, DeviceContext>(context, in, out, expand_shape); - break; - } -} - -template -static void linalg_solve(const framework::ExecutionContext& context, - const framework::Tensor* x, - const framework::Tensor* y, - framework::Tensor* out) { - out->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.template device_context(); - phi::funcs::MatrixSolveFunctor mat_solve; - - // input y can be vector or matrix - // but need to be unsqueezed if y is a vector - bool is_vector = false; - is_vector = is_vector_rhs(*x, *y); - - Tensor tmp_y; - if (is_vector) { - tmp_y.mutable_data(context.GetPlace(), y->dtype()); - to_unsqueeze(context, *y, &tmp_y); - } else { - tmp_y.Resize(y->dims()); - tmp_y.mutable_data(context.GetPlace(), y->dtype()); - framework::TensorCopy( - *y, - context.GetPlace(), - context.template device_context(), - &tmp_y); - } - - Tensor tmp_x; - tmp_x.Resize(x->dims()); - tmp_x.mutable_data(context.GetPlace(), x->dtype()); - framework::TensorCopy( - *x, - context.GetPlace(), - context.template device_context(), - &tmp_x); - - std::vector x_broadcast_dims; - std::vector y_broadcast_dims; - std::tie(x_broadcast_dims, y_broadcast_dims) = - get_broadcast_dims(tmp_x, tmp_y); - - Tensor tmp_x_bc; - TensorExpand(dev_ctx, tmp_x, &tmp_x_bc, x_broadcast_dims); - - Tensor tmp_y_bc; - TensorExpand(dev_ctx, tmp_y, &tmp_y_bc, y_broadcast_dims); - - auto x_dim = x->dims(); - auto y_dim = y->dims(); - auto x_dim_size = x_dim.size(); - auto y_dim_size = y_dim.size(); - - if (is_vector) { // vector case - out->Resize(tmp_y_bc.dims()); // out.unsqueeze(-1) - mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); - - Tensor out_tmp; - out_tmp.Resize(out->dims()); - out_tmp = *out; - to_squeeze(context, out_tmp, out); // out.squeeze(-1) - } else { - PADDLE_ENFORCE_EQ( - x_dim[x_dim_size - 1], - y_dim[y_dim_size - 2], - platform::errors::InvalidArgument( - "Matrix X1 with dimension greater than 2 and any matrix Y1," - "the matrix X1's width must be equal with matrix Y1's " - "height. 
But received X's shape = [%s], X1's shape = [%s], X1's " - "width = %s; Y's shape = [%s], Y1's shape = [%s], Y1's height = " - "%s.", - x_dim, - x_dim, - x_dim[x_dim_size - 1], - y_dim, - y_dim, - y_dim[y_dim_size - 2])); - mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); - } -} - -template -class SolveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto* x = context.Input("X"); - const auto* y = context.Input("Y"); - Tensor* out = context.Output("Out"); - linalg_solve(context, x, y, out); - } -}; - -template -class SolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - // reuse the linalg.solve forward output - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - bool is_vector = false; - is_vector = is_vector_rhs(*input, *y); - - Tensor tmp_y; - if (is_vector) { - tmp_y.mutable_data(ctx.GetPlace(), y->dtype()); - to_unsqueeze(ctx, *y, &tmp_y); - } else { - tmp_y.Resize(y->dims()); - tmp_y.mutable_data(ctx.GetPlace(), y->dtype()); - framework::TensorCopy( - *y, - ctx.GetPlace(), - ctx.template device_context(), - &tmp_y); - } - - Tensor tmp_x; - tmp_x.Resize(input->dims()); - tmp_x.mutable_data(ctx.GetPlace(), input->dtype()); - framework::TensorCopy( - *input, - ctx.GetPlace(), - ctx.template device_context(), - &tmp_x); - - std::vector x_broadcast_dims; - std::vector y_broadcast_dims; - std::tie(x_broadcast_dims, y_broadcast_dims) = - get_broadcast_dims(tmp_x, tmp_y); - - // tmp_dx - Tensor tmp_dx; - tmp_dx.Resize(phi::make_ddim(x_broadcast_dims)); - tmp_dx.mutable_data(ctx.GetPlace()); - - // tmp_dy - Tensor tmp_dy; - tmp_dy.Resize(phi::make_ddim(y_broadcast_dims)); - tmp_dy.mutable_data(ctx.GetPlace()); - - Tensor tmp_input(input->dtype()); - const auto& new_dims_vec = phi::funcs::getNewDimsVec(input->dims()); - tmp_input.Resize(phi::make_ddim(new_dims_vec)); - tmp_input.mutable_data(ctx.GetPlace()); - phi::funcs::TransposeNormal trans; - std::vector new_axis = phi::funcs::getNewAxis(input->dims().size()); - auto& dev_ctx = ctx.template device_context(); - trans(dev_ctx, *input, &tmp_input, new_axis); - - if (dy) { - dy->mutable_data(ctx.GetPlace()); - // reuse linalg_solve forward logics to get tmp_dy - linalg_solve(ctx, &tmp_input, dout, &tmp_dy); - } - - if (dx) { - dx->mutable_data(ctx.GetPlace()); - // to get dx - auto blas = phi::funcs::GetBlas(ctx); - if (input->dims().size() == 2 && y->dims().size() == 2) { - auto mat_dim_a1 = - phi::funcs::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(out->dims(), 0, true); - blas.MatMul(tmp_dy, mat_dim_a1, *out, mat_dim_b1, T(-1), &tmp_dx, T(0)); - } else if (is_vector_rhs(*input, *y)) { - Tensor tmp_dy_; - tmp_dy_.mutable_data(ctx.GetPlace(), y->dtype()); - to_unsqueeze(ctx, tmp_dy, &tmp_dy_); - - Tensor tmp_out_; - tmp_out_.mutable_data(ctx.GetPlace(), out->dtype()); - to_unsqueeze(ctx, *out, &tmp_out_); - - auto mat_dim_a1 = - phi::funcs::CreateMatrixDescriptor(tmp_dy_.dims(), 0, false); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(tmp_out_.dims(), 0, true); - blas.MatMul( - tmp_dy_, mat_dim_a1, tmp_out_, mat_dim_b1, T(-1), &tmp_dx, T(0)); - } else { - auto mat_dim_a1 = - 
phi::funcs::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); - auto mat_dim_b1 = - phi::funcs::CreateMatrixDescriptor(out->dims(), 0, true); - blas.MatMul(tmp_dy, mat_dim_a1, *out, mat_dim_b1, T(-1), &tmp_dx, T(0)); - } - } - - if (y->dims() != tmp_dy.dims()) { - Tensor dy_help; - dy_help.Resize(tmp_dy.dims()); - dy_help.mutable_data(ctx.GetPlace(), tmp_dy.dtype()); - framework::TensorCopy( - tmp_dy, - ctx.GetPlace(), - ctx.template device_context(), - &dy_help); - - // get dims - std::vector x_dims = vectorize(input->dims()); - std::vector y_dims = vectorize(y->dims()); - std::vector dout_dims = vectorize(dout->dims()); - - if (is_vector_rhs(*input, *y)) { - dout_dims.push_back(1); - } - - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - const std::vector dy_help_dims = vectorize(dy_help.dims()); - std::vector dy_broadcast_dims(ndim); - - std::fill(dy_broadcast_dims.data(), - dy_broadcast_dims.data() + ndim - y_ndim, - 1); - std::copy(y_dims.data(), - y_dims.data() + y_ndim, - dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - // reduce sum to get grad by ReduceSum - if (dy) { - if (dy_reduce_dims.empty()) { - *dy = std::move(dy_help); - } else { - bool keep_dim = true; - if (dy_help.dims().size() != dy->dims().size()) { - keep_dim = false; - } - ReduceSumForSolve( - &dy_help, dy, dy_reduce_dims, keep_dim, ctx); - } - dy->Resize(y->dims()); - } - } else { - framework::TensorCopy( - tmp_dy, - ctx.GetPlace(), - ctx.template device_context(), - dy); - } - - if (input->dims() != tmp_dx.dims()) { - Tensor dx_help; - dx_help.Resize(tmp_dx.dims()); - dx_help.mutable_data(ctx.GetPlace(), tmp_dx.dtype()); - framework::TensorCopy( - tmp_dx, - ctx.GetPlace(), - ctx.template device_context(), - &dx_help); - - // get dims - std::vector x_dims = vectorize(input->dims()); - std::vector y_dims = vectorize(y->dims()); - - int x_ndim = x_dims.size(); - int ndim = x_broadcast_dims.size(); - - const std::vector dx_help_dims = vectorize(dx_help.dims()); - std::vector dx_broadcast_dims(ndim); - - std::fill(dx_broadcast_dims.data(), - dx_broadcast_dims.data() + ndim - x_ndim, - 1); - std::copy(x_dims.data(), - x_dims.data() + x_ndim, - dx_broadcast_dims.data() + ndim - x_ndim); - - std::vector dx_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - } - // reduce sum to get grad by ReduceSum - if (dx) { - dx->mutable_data(ctx.GetPlace()); - if (dx_reduce_dims.empty()) { - *dx = std::move(dx_help); - } else { - bool keep_dim = true; - if (dx_help.dims().size() != dx->dims().size()) { - keep_dim = false; - } - ReduceSumForSolve( - &dx_help, dx, dx_reduce_dims, keep_dim, ctx); - } - dx->Resize(input->dims()); - } - } else { - framework::TensorCopy( - tmp_dx, - ctx.GetPlace(), - ctx.template device_context(), - dx); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 02f812f9b17c0..5958f0e71e76a 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -3211,15 +3211,18 @@ void UnsqueezeInferMeta(const MetaTensor& x, } out->set_dtype(x.dtype()); } - // set xshape dims. 
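// NOTE on the UnsqueezeInferMeta change just below (editorial, not from the
// original patch): wrapping the xshape bookkeeping in `if (xshape)` appears
// to be what lets callers pass a null xshape, as the new phi solve kernels do
// with `phi::Unsqueeze(dev_ctx, y, {-1}, &tmp_y, nullptr)`, so the reshape
// shadow tensor is only produced when a caller actually asks for it.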
- std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; + if (xshape) { + // set xshape dims. + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + + xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->share_lod(x); + xshape->set_dtype(x.dtype()); } - xshape->set_dims(phi::make_ddim(xshape_dims)); - xshape->share_lod(x); - xshape->set_dtype(x.dtype()); } void UnStackInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 455d42b548606..05abcbd0d1964 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -62,6 +62,7 @@ set(COMMON_KERNEL_DEPS pooling maxouting matrix_inverse + matrix_solve phi_dynload_warpctc sequence_padding sequence_scale) diff --git a/paddle/phi/kernels/cpu/solve_grad_kernel.cc b/paddle/phi/kernels/cpu/solve_grad_kernel.cc new file mode 100644 index 0000000000000..3b11d49259fd6 --- /dev/null +++ b/paddle/phi/kernels/cpu/solve_grad_kernel.cc @@ -0,0 +1,20 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/solve_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/solve_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + solve_grad, CPU, ALL_LAYOUT, phi::SolveGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/solve_kernel.cc b/paddle/phi/kernels/cpu/solve_kernel.cc new file mode 100644 index 0000000000000..bde049bcc3ec0 --- /dev/null +++ b/paddle/phi/kernels/cpu/solve_kernel.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/solve_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/solve_kernel_impl.h" + +PD_REGISTER_KERNEL(solve, CPU, ALL_LAYOUT, phi::SolveKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/solve_grad_kernel.cu b/paddle/phi/kernels/gpu/solve_grad_kernel.cu new file mode 100644 index 0000000000000..c13c3b6545c44 --- /dev/null +++ b/paddle/phi/kernels/gpu/solve_grad_kernel.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/solve_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/solve_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + solve_grad, GPU, ALL_LAYOUT, phi::SolveGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/solve_kernel.cu b/paddle/phi/kernels/gpu/solve_kernel.cu new file mode 100644 index 0000000000000..59bc77ca0b975 --- /dev/null +++ b/paddle/phi/kernels/gpu/solve_kernel.cu @@ -0,0 +1,19 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/solve_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/solve_kernel_impl.h" + +PD_REGISTER_KERNEL(solve, GPU, ALL_LAYOUT, phi::SolveKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/solve_grad_kernel_impl.h b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h new file mode 100644 index 0000000000000..55ee023cb5caa --- /dev/null +++ b/paddle/phi/kernels/impl/solve_grad_kernel_impl.h @@ -0,0 +1,267 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/expand_as_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_solve.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/impl/solve_kernel_impl.h" +#include "paddle/phi/kernels/squeeze_kernel.h" +#include "paddle/phi/kernels/unsqueeze_kernel.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/gpu/reduce.h" +#endif + +namespace phi { + +template +struct ReduceSumForSolvelGrad { + void operator()(const Context& dev_ctx, + const DenseTensor& input, + DenseTensor* output, + const std::vector& reduce_dims, + bool keep_dims); +}; + +template +struct ReduceSumForSolvelGrad { + void operator()(const CPUContext& dev_ctx, + const DenseTensor& input, + DenseTensor* output, + const std::vector& reduce_dims, + bool keep_dims) { + std::vector reduce_dims_tmp(reduce_dims.begin(), + reduce_dims.end()); + phi::ReduceKernelImpl( + dev_ctx, input, output, reduce_dims_tmp, keep_dims, false); + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +template +struct ReduceSumForSolvelGrad { + void operator()(const GPUContext& dev_ctx, + const DenseTensor& input, + DenseTensor* output, + const std::vector& reduce_dims, + bool keep_dims) { + phi::funcs::ReduceKernel>( + dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims); + } +}; +#endif + +template +void SolveGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + const DenseTensor& out, + DenseTensor* dx, + DenseTensor* dy) { + bool is_vector = false; + is_vector = is_vector_rhs(x, y); + DenseTensor tmp_y; + if (is_vector) { + dev_ctx.Alloc(&tmp_y, y.dtype()); + phi::Unsqueeze(dev_ctx, y, {-1}, &tmp_y, nullptr); + } else { + tmp_y.Resize(y.dims()); + dev_ctx.Alloc(&tmp_y, y.dtype()); + phi::Copy(dev_ctx, y, dev_ctx.GetPlace(), false, &tmp_y); + } + DenseTensor tmp_x; + tmp_x.Resize(x.dims()); + dev_ctx.Alloc(&tmp_x, x.dtype()); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &tmp_x); + + std::vector x_broadcast_dims; + std::vector y_broadcast_dims; + std::tie(x_broadcast_dims, y_broadcast_dims) = + get_broadcast_dims(tmp_x, tmp_y); + // tmp_dx + DenseTensor tmp_dx; + tmp_dx.Resize(phi::make_ddim(x_broadcast_dims)); + dev_ctx.template Alloc(&tmp_dx); + + // tmp_dy + DenseTensor tmp_dy; + tmp_dy.Resize(phi::make_ddim(y_broadcast_dims)); + dev_ctx.template Alloc(&tmp_dy); + + DenseTensor tmp_input(x.dtype()); + const auto& new_dims_vec = phi::funcs::getNewDimsVec(x.dims()); + tmp_input.Resize(phi::make_ddim(new_dims_vec)); + dev_ctx.template Alloc(&tmp_input); + + phi::funcs::TransposeNormal trans; + std::vector new_axis = phi::funcs::getNewAxis(x.dims().size()); + trans(dev_ctx, x, &tmp_input, new_axis); + + if (dy) { + dev_ctx.template Alloc(dy); + linalg_solve(dev_ctx, tmp_input, dout, &tmp_dy); + } + + if (dx) { + dev_ctx.template Alloc(dx); + + // to get dx + auto blas = phi::funcs::GetBlas(dev_ctx); + if (x.dims().size() == 2 && y.dims().size() == 2) { + auto mat_dim_a1 = + phi::funcs::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); + auto mat_dim_b1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + blas.MatMul(tmp_dy, mat_dim_a1, out, mat_dim_b1, T(-1), &tmp_dx, T(0)); + + } else if (is_vector_rhs(x, y)) { + DenseTensor 
tmp_dy_; + dev_ctx.Alloc(&tmp_dy_, y.dtype()); + + phi::Unsqueeze(dev_ctx, + tmp_dy, + paddle::experimental::IntArray({-1}), + &tmp_dy_, + nullptr); + + DenseTensor tmp_out_; + dev_ctx.Alloc(&tmp_out_, out.dtype()); + + phi::Unsqueeze(dev_ctx, + out, + paddle::experimental::IntArray({-1}), + &tmp_out_, + nullptr); + + auto mat_dim_a1 = + phi::funcs::CreateMatrixDescriptor(tmp_dy_.dims(), 0, false); + auto mat_dim_b1 = + phi::funcs::CreateMatrixDescriptor(tmp_out_.dims(), 0, true); + blas.MatMul( + tmp_dy_, mat_dim_a1, tmp_out_, mat_dim_b1, T(-1), &tmp_dx, T(0)); + + } else { + auto mat_dim_a1 = + phi::funcs::CreateMatrixDescriptor(tmp_dy.dims(), 0, false); + auto mat_dim_b1 = phi::funcs::CreateMatrixDescriptor(out.dims(), 0, true); + blas.MatMul(tmp_dy, mat_dim_a1, out, mat_dim_b1, T(-1), &tmp_dx, T(0)); + } + } + if (y.dims() != tmp_dy.dims()) { + DenseTensor dy_help; + dy_help.Resize(tmp_dy.dims()); + dev_ctx.Alloc(&dy_help, tmp_dy.dtype()); + + phi::Copy(dev_ctx, tmp_dy, dev_ctx.GetPlace(), false, &dy_help); + + // get dims + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + std::vector dout_dims = vectorize(dout.dims()); + + if (is_vector_rhs(x, y)) { + dout_dims.push_back(1); + } + + int y_ndim = y_dims.size(); + int ndim = dout_dims.size(); + + const std::vector dy_help_dims = vectorize(dy_help.dims()); + std::vector dy_broadcast_dims(ndim); + + std::fill( + dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); + std::copy(y_dims.data(), + y_dims.data() + y_ndim, + dy_broadcast_dims.data() + ndim - y_ndim); + + std::vector dy_reduce_dims; + for (int idx = 0; idx <= ndim - 3; idx++) { + if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { + dy_reduce_dims.push_back(idx); + } + } + // reduce sum to get grad by ReduceSum + if (dy) { + if (dy_reduce_dims.empty()) { + *dy = std::move(dy_help); + } else { + bool keep_dim = true; + if (dy_help.dims().size() != dy->dims().size()) { + keep_dim = false; + } + ReduceSumForSolvelGrad()( + dev_ctx, dy_help, dy, dy_reduce_dims, keep_dim); + } + dy->Resize(y.dims()); + } + } else { + phi::Copy(dev_ctx, tmp_dy, dev_ctx.GetPlace(), false, dy); + } + + if (x.dims() != tmp_dx.dims()) { + DenseTensor dx_help; + dx_help.Resize(tmp_dx.dims()); + dev_ctx.Alloc(&dx_help, tmp_dx.dtype()); + phi::Copy(dev_ctx, tmp_dx, dev_ctx.GetPlace(), false, &dx_help); + // get dims + std::vector x_dims = vectorize(x.dims()); + std::vector y_dims = vectorize(y.dims()); + + int x_ndim = x_dims.size(); + int ndim = x_broadcast_dims.size(); + + const std::vector dx_help_dims = vectorize(dx_help.dims()); + std::vector dx_broadcast_dims(ndim); + std::fill( + dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); + std::copy(x_dims.data(), + x_dims.data() + x_ndim, + dx_broadcast_dims.data() + ndim - x_ndim); + + std::vector dx_reduce_dims; + for (int idx = 0; idx <= ndim - 3; idx++) { + if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { + dx_reduce_dims.push_back(idx); + } + } + // reduce sum to get grad by ReduceSum + if (dx) { + dev_ctx.template Alloc(dx); + + if (dx_reduce_dims.empty()) { + *dx = std::move(dx_help); + } else { + bool keep_dim = true; + if (dx_help.dims().size() != dx->dims().size()) { + keep_dim = false; + } + ReduceSumForSolvelGrad()( + dev_ctx, dx_help, dx, dx_reduce_dims, keep_dim); + } + dx->Resize(x.dims()); + } + } else { + phi::Copy(dev_ctx, tmp_dx, dev_ctx.GetPlace(), false, dx); + } +} + +} // namespace phi diff --git 
a/paddle/phi/kernels/impl/solve_kernel_impl.h b/paddle/phi/kernels/impl/solve_kernel_impl.h new file mode 100644 index 0000000000000..09c9e74dd207a --- /dev/null +++ b/paddle/phi/kernels/impl/solve_kernel_impl.h @@ -0,0 +1,199 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/expand_as_kernel.h" +#include "paddle/phi/kernels/funcs/matrix_solve.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" +#include "paddle/phi/kernels/squeeze_kernel.h" +#include "paddle/phi/kernels/unsqueeze_kernel.h" + +namespace phi { + +using Tensor = DenseTensor; + +// check the input other is vector_case or not +static inline bool is_vector_rhs(const DenseTensor& input, + const DenseTensor& other) { + auto x_dim = input.dims(); + auto y_dim = other.dims(); + auto x_dim_size = x_dim.size(); + auto y_dim_size = y_dim.size(); + std::vector x_dims_vec = phi::vectorize(x_dim); + std::vector y_dims_vec = phi::vectorize(y_dim); + + std::vector::const_iterator f = x_dims_vec.begin(); + std::vector::const_iterator l = x_dims_vec.end() - 1; + std::vector x_dims_vec_cut(f, l); // input.shape[:-1] + + std::vector expected_batched_rhs_shape(x_dims_vec_cut); + bool vector_case = + y_dim_size == 1 || (x_dim_size - 1 == y_dim_size && + y_dims_vec == (expected_batched_rhs_shape)); + + return vector_case; +} + +// Prepared for the broadcast operation +static std::vector get_broadcast_batch_portion( + std::vector x, std::vector y) { + size_t size_x = x.size(); + size_t size_y = y.size(); + size_t size = std::max(size_x, size_y); + std::vector batchPortion(size); + ptrdiff_t i = (ptrdiff_t)size - 1; + for (; i >= 0; --i) { + ptrdiff_t offset = size - i - 1; + ptrdiff_t dim_x = size_x - offset - 1; + ptrdiff_t dim_y = size_y - offset - 1; + int64_t x_size = (dim_x >= 0) ? x[dim_x] : 1; + int64_t y_size = (dim_y >= 0) ? y[dim_y] : 1; + PADDLE_ENFORCE_EQ( + (x_size == y_size || x_size == 1 || y_size == 1), + true, + phi::errors::PreconditionNotMet( + "The size of tensor x (%d) must match the size of tensor y " + "(%d) at non-singleton dimension %d.", + x_size, + y_size, + i)); + + batchPortion[i] = x_size != 1 ? x_size : y_size; + } + return batchPortion; +} + +static inline std::vector convert_to_int_vec(std::vector a) { + std::vector ret; + for (size_t i = 0; i < a.size(); i++) { + ret.emplace_back(int(a[i])); + } + + return ret; +} + +// broadcast the batch dimensions of tensor x and tensor y. 
+static inline std::tuple, std::vector> +get_broadcast_dims(const Tensor& x, const Tensor& y) { + std::vector x_dims_vec = phi::vectorize(x.dims()); + std::vector y_dims_vec = phi::vectorize(y.dims()); + std::vector::const_iterator f1 = x_dims_vec.begin(); + std::vector::const_iterator l1 = x_dims_vec.end() - 2; + std::vector x_dims_vec_cut(f1, l1); + + std::vector::const_iterator f2 = y_dims_vec.begin(); + std::vector::const_iterator l2 = y_dims_vec.end() - 2; + std::vector y_dims_vec_cut(f2, l2); + + std::vector expand_batch_portion = + get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); + std::vector x_expand_size({expand_batch_portion}); + x_expand_size.insert(x_expand_size.end(), + {x_dims_vec[static_cast(x_dims_vec.size()) - 2], + x_dims_vec[static_cast(x_dims_vec.size()) - 1]}); + std::vector y_expand_size({expand_batch_portion}); + y_expand_size.insert(y_expand_size.end(), + {y_dims_vec[static_cast(y_dims_vec.size()) - 2], + y_dims_vec[static_cast(y_dims_vec.size()) - 1]}); + + return std::make_tuple(x_expand_size, y_expand_size); +} + +template +static void linalg_solve(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + dev_ctx.template Alloc(out); + phi::funcs::MatrixSolveFunctor mat_solve; + + // input y can be vector or matrix + // but need to be unsqueezed if y is a vector + bool is_vector = false; + is_vector = is_vector_rhs(x, y); + + Tensor tmp_y; + if (is_vector) { + dev_ctx.Alloc(&tmp_y, y.dtype()); + + phi::Unsqueeze(dev_ctx, y, {-1}, &tmp_y, nullptr); + } else { + tmp_y.Resize(y.dims()); + dev_ctx.Alloc(&tmp_y, y.dtype()); + + phi::Copy(dev_ctx, y, dev_ctx.GetPlace(), false, &tmp_y); + } + + Tensor tmp_x; + tmp_x.Resize(x.dims()); + dev_ctx.Alloc(&tmp_x, x.dtype()); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, &tmp_x); + + std::vector x_broadcast_dims; + std::vector y_broadcast_dims; + std::tie(x_broadcast_dims, y_broadcast_dims) = + get_broadcast_dims(tmp_x, tmp_y); + + Tensor tmp_x_bc; + + phi::ExpandAsKernel( + dev_ctx, tmp_x, nullptr, convert_to_int_vec(x_broadcast_dims), &tmp_x_bc); + + Tensor tmp_y_bc; + phi::ExpandAsKernel( + dev_ctx, tmp_y, nullptr, convert_to_int_vec(y_broadcast_dims), &tmp_y_bc); + + auto x_dim = x.dims(); + auto y_dim = y.dims(); + auto x_dim_size = x_dim.size(); + auto y_dim_size = y_dim.size(); + + if (is_vector) { // vector case + out->Resize(tmp_y_bc.dims()); // out.unsqueeze(-1) + mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); + + Tensor out_tmp; + out_tmp.Resize(out->dims()); + out_tmp = *out; + + phi::SqueezeKernel(dev_ctx, out_tmp, {-1}, out, nullptr); + } else { + PADDLE_ENFORCE_EQ( + x_dim[x_dim_size - 1], + y_dim[y_dim_size - 2], + phi::errors::InvalidArgument( + "Matrix X1 with dimension greater than 2 and any matrix Y1," + "the matrix X1's width must be equal with matrix Y1's " + "height. 
But received X's shape = [%s], X1's shape = [%s], X1's " + "width = %s; Y's shape = [%s], Y1's shape = [%s], Y1's height = " + "%s.", + x_dim, + x_dim, + x_dim[x_dim_size - 1], + y_dim, + y_dim, + y_dim[y_dim_size - 2])); + mat_solve(dev_ctx, tmp_x_bc, tmp_y_bc, out); + } +} + +template +void SolveKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + linalg_solve(dev_ctx, x, y, out); +} + +} // namespace phi diff --git a/paddle/phi/kernels/solve_grad_kernel.h b/paddle/phi/kernels/solve_grad_kernel.h new file mode 100644 index 0000000000000..31bdb9932becc --- /dev/null +++ b/paddle/phi/kernels/solve_grad_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SolveGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + const DenseTensor& out, + DenseTensor* dx, + DenseTensor* dy); + +} // namespace phi diff --git a/paddle/phi/kernels/solve_kernel.h b/paddle/phi/kernels/solve_kernel.h new file mode 100644 index 0000000000000..28dddb0f641bd --- /dev/null +++ b/paddle/phi/kernels/solve_kernel.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SolveKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/unsqueeze_kernel.h b/paddle/phi/kernels/unsqueeze_kernel.h index 4622a9b0a859c..62ba878c056cb 100644 --- a/paddle/phi/kernels/unsqueeze_kernel.h +++ b/paddle/phi/kernels/unsqueeze_kernel.h @@ -17,6 +17,7 @@ #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" namespace phi { @@ -26,4 +27,16 @@ void UnsqueezeKernel(const Context& dev_ctx, const IntArray& axes, DenseTensor* out, DenseTensor* xshape); + +template +void Unsqueeze(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& axes, + DenseTensor* out, + DenseTensor* xshape) { + MetaTensor meta_out(out); + UnsqueezeInferMeta(x, axes, &meta_out, nullptr, MetaConfig()); + UnsqueezeKernel(dev_ctx, x, axes, out, nullptr); +} + } // namespace phi diff --git a/paddle/phi/ops/compat/solve_sig.cc b/paddle/phi/ops/compat/solve_sig.cc new file mode 100644 index 0000000000000..9771adee8e983 --- /dev/null +++ b/paddle/phi/ops/compat/solve_sig.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SolveGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "solve_grad", {"X", "Y", "Out@GRAD", "Out"}, {}, {"X@GRAD", "Y@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(solve_grad, phi::SolveGradOpArgumentMapping); From 876e2ff1f62fde4d3dc56f7dd1403c659fbfb0b9 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Mon, 18 Jul 2022 10:06:23 +0800 Subject: [PATCH 230/250] [auto parallel] remove comm init control (#44385) --- .../distributed/auto_parallel/engine.py | 56 +------------------ 1 file changed, 1 insertion(+), 55 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 1e1e37b4435ce..72a377603edc7 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -324,65 +324,11 @@ def _initialize(self, mode): # instantiate communication by process_mapping. 
all_process_groups = get_all_process_groups() - has_recv_by_socket = [] - # This is a magic number and the rank number for training is usually less than 5000 - magic_num = 5000 - genv = _get_global_env() - cur_rank_ip, cur_rank_port = genv.current_endpoint.split(":") - cur_rank_recv_port = int(cur_rank_port) + magic_num - server_socket = None - # Large enough for recv rank - buff_size = 1024 - server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - server_socket.bind((cur_rank_ip, cur_rank_recv_port)) - # The 10 is an empirical value - server_socket.listen(10) - client_sockets = {} + # NOTE: add the comm init control in the future for auto search for process_group in all_process_groups: if self._cur_rank not in process_group.ranks: continue - if len(process_group.ranks) == 2: - index = process_group.ranks.index(self._cur_rank) - is_send = True if index == 0 else False - if is_send: - recv_rank = process_group.ranks[1] - recv_rank_ip, recv_rank_port = genv.trainer_endpoints[ - recv_rank].split(":") - connect_port = int(recv_rank_port) + magic_num - client_socket = socket.socket(socket.AF_INET, - socket.SOCK_STREAM) - client_socket.connect((recv_rank_ip, connect_port)) - client_socket.send(str(self._cur_rank).encode('utf-8')) - rank = client_socket.recv(buff_size).decode('utf-8') - rank = int(rank) - if rank != recv_rank: - raise ValueError( - "Please check comm pair, the recv rank should be {} but got {}." - .format(recv_rank, rank)) - else: - print("It is able to instantiate {} as sender now.". - format(process_group.ranks)) - client_socket.close() - else: - send_rank = process_group.ranks[0] - while True: - if send_rank not in has_recv_by_socket: - client_socket, recv_addr = server_socket.accept( - ) - rank = int( - client_socket.recv(buff_size).decode()) - client_sockets[rank] = client_socket - has_recv_by_socket.append(rank) - else: - client_sockets[send_rank].send( - str(self._cur_rank).encode("utf-8")) - client_sockets[send_rank].close() - print( - "It is able to instantiate {} as recver now." - .format(process_group.ranks)) - break process_group.instantiate() - server_socket.close() self._place = _get_device() if isinstance(self._place, fluid.CUDAPlace): From fd6dcdfe5ee255d7a46ba51f6903ab9934c32529 Mon Sep 17 00:00:00 2001 From: ronnywang Date: Mon, 18 Jul 2022 10:17:04 +0800 Subject: [PATCH 231/250] [CustomDevice] remove unused file (#44358) --- paddle/phi/backends/c_comm_lib.cc | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 paddle/phi/backends/c_comm_lib.cc diff --git a/paddle/phi/backends/c_comm_lib.cc b/paddle/phi/backends/c_comm_lib.cc deleted file mode 100644 index 7f86ac6eff91f..0000000000000 --- a/paddle/phi/backends/c_comm_lib.cc +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/backends/c_comm_lib.h" - -namespace phi { -// Even this source file does not contains any code, it is better to keep this -// source file for cmake dependency. -} // namespace phi From b7db8457c994b87a58b9bf6ec5407b55acea32e2 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Mon, 18 Jul 2022 10:34:33 +0800 Subject: [PATCH 232/250] [Paddle-TRT] reshape fill_constant (#44314) * reshape fill_constant * commit * commit --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/fill_constant_op.cc | 71 ++++ .../inference/tensorrt/convert/reshape_op.cc | 28 +- paddle/fluid/inference/tensorrt/op_teller.cc | 26 ++ .../test_trt_convert_fill_constant.py | 142 +++++++ .../ir/inference/test_trt_convert_reshape.py | 365 ++++++++++++++---- 7 files changed, 551 insertions(+), 83 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5e787394bce25..541c53c8dae64 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2089,6 +2089,7 @@ USE_TRT_CONVERTER(top_k) USE_TRT_CONVERTER(top_k_v2) USE_TRT_CONVERTER(squeeze2) USE_TRT_CONVERTER(unsqueeze2) +USE_TRT_CONVERTER(fill_constant) USE_TRT_CONVERTER(fused_token_prune) #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) USE_TRT_CONVERTER(sparse_fc) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ca91df902a9a1..519daba2747d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -69,6 +69,7 @@ list( top_k_op.cc squeeze2_op.cc unsqueeze2_op.cc + fill_constant_op.cc fused_token_prune_op.cc) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) diff --git a/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc new file mode 100644 index 0000000000000..53eb3f2c89732 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/fill_constant_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class FillConstantOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + VLOG(4) + << "convert a fluid fill_constant op to tensorrt fill_constant layer"; + + framework::OpDesc op_desc(op, nullptr); + int dtype = BOOST_GET_CONST(int, op_desc.GetAttr("dtype")); + std::string str_value = + BOOST_GET_CONST(std::string, op_desc.GetAttr("str_value")); + std::vector shape = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("shape")); + std::unique_ptr out_tensor(new framework::Tensor()); + out_tensor->Resize(phi::make_ddim(shape)); + nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT; + void* trt_data = nullptr; + size_t trt_num; + if (dtype == 2 || dtype == 3) { // int,int64 + auto* tmp_ptr = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); i++) + tmp_ptr[i] = std::stoi(str_value); + trt_dtype = nvinfer1::DataType::kINT32; + trt_data = static_cast(tmp_ptr); + } else if (dtype == 5) { // float + auto* tmp_ptr = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); i++) + tmp_ptr[i] = std::stof(str_value); + trt_data = static_cast(tmp_ptr); + } + + trt_num = static_cast(out_tensor->numel()); + engine_->SetWeights("fill_constant_value", std::move(out_tensor)); + TensorRTEngine::Weight weight{trt_dtype, trt_data, trt_num}; + + nvinfer1::Dims trt_in_shape; + trt_in_shape.nbDims = shape.size(); + for (size_t i = 0; i < shape.size(); i++) trt_in_shape.d[i] = shape[i]; + nvinfer1::ILayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_in_shape, weight.get()); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "fill_constant", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(fill_constant, FillConstantOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc index 00ee5503cc2e2..eec881eae8e18 100644 --- a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc @@ -35,14 +35,29 @@ class ReshapeOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + std::vector shape = BOOST_GET_CONST(std::vector, op_desc.GetAttr("shape")); int nbDims_num = shape.size(); nvinfer1::Dims reshape_dim; - if (engine_->with_dynamic_shape()) { // running the TRT Dynamic Shape mode - reshape_dim.nbDims = nbDims_num; - for (int i = 0; i < nbDims_num; ++i) { - reshape_dim.d[i] = shape[i]; + nvinfer1::ITensor* real_shape_tensor = nullptr; + std::vector concat_inputs; + bool one_input = false; + if (engine_->with_dynamic_shape()) { + if (op_desc.Inputs().find("ShapeTensor") != op_desc.Inputs().end() && + op_desc.Input("ShapeTensor").size() > 0) { + for (auto name : op_desc.Input("ShapeTensor")) + concat_inputs.push_back(engine_->GetITensor(name)); + real_shape_tensor = Concat(concat_inputs); + } else if (op_desc.Inputs().find("Shape") != op_desc.Inputs().end() && + op_desc.Input("Shape").size() > 0) { + real_shape_tensor = engine_->GetITensor(op_desc.Input("Shape")[0]); + } else { + reshape_dim.nbDims = nbDims_num; + for (int i = 0; i < 
nbDims_num; ++i) { + reshape_dim.d[i] = shape[i]; + } + one_input = true; } } else { // running the TRT Static Shape mode reshape_dim.nbDims = nbDims_num - 1; @@ -51,7 +66,10 @@ class ReshapeOpConverter : public OpConverter { } } auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); - layer->setReshapeDimensions(reshape_dim); + if (!engine_->with_dynamic_shape() || one_input) + layer->setReshapeDimensions(reshape_dim); + else + layer->setInput(1, *real_shape_tensor); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "reshape", {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index eaef331356575..05d0b41f14e1f 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -169,6 +169,7 @@ struct SimpleOpTypeSetTeller : public Teller { "transformer_input_convert", "recover_padding", "remove_padding", + "fill_constant", "squeeze2", "unsqueeze2"}; std::unordered_set teller_set{ @@ -274,6 +275,7 @@ struct SimpleOpTypeSetTeller : public Teller { "transformer_input_convert", "recover_padding", "remove_padding", + "fill_constant", "squeeze2", "unsqueeze2", "fused_token_prune"}; @@ -1448,6 +1450,27 @@ bool OpTeller::Tell(const framework::ir::Node* node, } } + if (op_type == "fill_constant") { + auto fill_constant_inputs = desc.Inputs(); + if (fill_constant_inputs.find("ValueTensor") != + fill_constant_inputs.end()) { + if (desc.Input("ValueTensor").size()) return false; + } + if (fill_constant_inputs.find("ShapeTensor") != + fill_constant_inputs.end()) { + if (desc.Input("ShapeTensor").size()) return false; + } + if (fill_constant_inputs.find("ShapeTensorList") != + fill_constant_inputs.end()) { + if (desc.Input("ShapeTensorList").size()) return false; + } + int dtype = BOOST_GET_CONST(int, desc.GetAttr("dtype")); + // only support int32, int64, float32 + if (!(dtype == 2 || dtype == 3 || dtype == 5)) { + return false; + } + } + if (op_type == "instance_norm") { if (with_dynamic_shape) { VLOG(3) << "trt instance_norm op does not support dynamic shape "; @@ -1801,6 +1824,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, } if (op_type == "reshape" || op_type == "reshape2") { + if (with_dynamic_shape) { + return true; + } if (!desc.HasAttr("shape")) { return false; } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py new file mode 100644 index 0000000000000..84ee70782acc2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py @@ -0,0 +1,142 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertSplitTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + + def generate_value_data(attrs: List[Dict[str, Any]]): + return np.array([1]).astype(np.int32) + + def generate_shape_data(attrs: List[Dict[str, Any]]): + return np.array([4, 23]).astype(np.int32) + + def generate_shapelist_data(attrs: List[Dict[str, Any]]): + return np.array([4]).astype(np.int32) + + for shape in [[2, 3, 4]]: + for num_input in [0, 1, 2, 3]: + for dtype in [5, 2, 3]: + for str_value in ["2", "23", "-1"]: + self.num_input = num_input + dics = [{ + "str_value": str_value, + "shape": shape, + "dtype": dtype + }, { + "axis": -1 + }] + dics_intput = [{ + "ValueTensor": ["value_data"] + }, { + "ShapeTensor": ["shape_data"], + }, { + "ShapeTensorList": ["shapeT1_data", "shapeT2_data"], + }, {}] + ops_config = [ + { + "op_type": "fill_constant", + "op_inputs": dics_intput[num_input], + "op_outputs": { + "Out": ["out_data"], + }, + "op_attrs": dics[0] + }, + ] + + def generate_input(): + return np.random.random([1, 1]).astype(np.float32) + + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "value_data": + TensorConfig(data_gen=partial( + generate_value_data, dics)), + "shape_data": + TensorConfig(data_gen=partial( + generate_shape_data, dics)), + "shapeT1_data": + TensorConfig(data_gen=partial( + generate_shapelist_data, dics)), + "shapeT2_data": + TensorConfig(data_gen=partial( + generate_shapelist_data, dics)), + }, + outputs=["out_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + self.input_shape = [1, 1] + max_shape = list(self.input_shape) + min_shape = list(self.input_shape) + opt_shape = list(self.input_shape) + for i in range(len(self.input_shape)): + max_shape[i] = max_shape[i] + 1 + self.dynamic_shape.min_input_shape = {"Y_data": min_shape} + self.dynamic_shape.max_input_shape = {"Y_data": max_shape} + self.dynamic_shape.opt_input_shape = {"Y_data": opt_shape} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if (self.num_input < 3): + return 0, 6 + return 1, 5 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + # Don't test static shape + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py index 
e05a78e66b900..7902a35a9a6b4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py @@ -48,12 +48,16 @@ def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]]): if self.dims == 4: + self.input_shape = [1, 2, 4, 6] return np.ones([1, 2, 4, 6]).astype(np.float32) elif self.dims == 3: + self.input_shape = [1, 8, 6] return np.ones([1, 8, 6]).astype(np.float32) elif self.dims == 2: + self.input_shape = [1, 48] return np.ones([1, 48]).astype(np.float32) elif self.dims == 1: + self.input_shape = [48] return np.ones([48]).astype(np.float32) def generate_weight1(attrs: List[Dict[str, Any]]): @@ -66,69 +70,36 @@ def generate_shapeT2_data(attrs: List[Dict[str, Any]]): return np.array([24]).astype(np.int32) for dims in [4, 3, 2, 1]: - for num_input in [0, 1, 2, 3]: - for shape in [[1, 6, 8], [1, 2, 4, 6], [1, 1, 0, 12], [1, 0, 6], - [1, -1, 12], [2, -1], [3, 16], [3, 4, 4], [48]]: - dics = [{ + for shape in [[1, 6, 8], [1, 2, 4, 6], [1, 1, 0, 12], [1, 0, 6], + [1, -1, 12], [2, -1], [3, 16], [3, 4, 4], [48], + [-1, 48]]: + dics = [ + { "shape": shape, - }, {}] - self.num_input = num_input - self.dims = dims - dics_intput = [{ - "X": ["reshape_input"], - "Shape": ["shape_data"], - "ShapeTensor": ["shapeT1_data", "shapeT2_data"], - }, { - "X": ["reshape_input"], - "Shape": ["shape_data"], - }, { - "X": ["reshape_input"], - "ShapeTensor": ["shapeT1_data", "shapeT2_data"], - }, { - "X": ["reshape_input"] - }] - - dics_weight = [{ - "shape_data": - TensorConfig(data_gen=partial(generate_weight1, dics)), - "shapeT1_data": - TensorConfig( - data_gen=partial(generate_shapeT1_data, dics)), - "shapeT2_data": - TensorConfig( - data_gen=partial(generate_shapeT2_data, dics)) - }, { - "shape_data": - TensorConfig(data_gen=partial(generate_weight1, dics)) - }, { - "shapeT1_data": - TensorConfig( - data_gen=partial(generate_shapeT1_data, dics)), - "shapeT2_data": - TensorConfig( - data_gen=partial(generate_shapeT2_data, dics)) - }, {}] - - ops_config = [{ - "op_type": "reshape", - "op_inputs": dics_intput[num_input], - "op_outputs": { - "Out": ["reshape_out"] - }, - "op_attrs": dics[0] - }] - ops = self.generate_op_config(ops_config) - program_config = ProgramConfig( - ops=ops, - weights=dics_weight[num_input], - inputs={ - "reshape_input": - TensorConfig( - data_gen=partial(generate_input1, dics)) - }, - outputs=["reshape_out"]) + }, + ] + self.dims = dims + dics_intput = [{"X": ["reshape_input"]}] + + ops_config = [{ + "op_type": "reshape", + "op_inputs": dics_intput[0], + "op_outputs": { + "Out": ["reshape_out"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "reshape_input": + TensorConfig(data_gen=partial(generate_input1, dics)) + }, + outputs=["reshape_out"]) - yield program_config + yield program_config def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): @@ -169,22 +140,31 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): + # in static shape mode, here is consistent with op_teller.cc + if (not dynamic_shape): + if (attrs[0]['shape'][0] == 0): + return 1, 2 + elif (len(attrs[0]['shape']) == 1): + return 0, 3 + elif (np.prod(attrs[0]['shape'][1:]) == np.prod( + self.input_shape[1:])): + return 1, 2 + else: + return 0, 3 return 1, 2 attrs = [ 
program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - if attrs[0]['shape'][0] > 1 and len(attrs[0]['shape']) > 1: - pass - else: - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) @@ -196,14 +176,243 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, True), 1e-5 def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +# reshape having three inputs. +class TrtConvertReshapeTest2(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): - def teller1(program_config, predictor_config): - if len(program_config.weights) >= 1: - return True - return False + def generate_input1(attrs: List[Dict[str, Any]]): + if self.dims == 4: + return np.random.random([1, 2, 4, 6]).astype(np.float32) + elif self.dims == 3: + return np.random.random([1, 8, 6]).astype(np.float32) + elif self.dims == 2: + return np.random.random([1, 48]).astype(np.float32) + elif self.dims == 1: + return np.random.random([48]).astype(np.float32) - self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, - "INPUT ShapeTensor and Shape NOT SUPPORT") + for dims in [4, 3, 2, 1]: + for shape in [[-1, 48]]: + dics = [{ + "shape": shape, + }, {}] + self.dims = dims + dics_intput = [ + { + "X": ["reshape_input"], + "ShapeTensor": ["shapeT1_data", "shapeT2_data"], + }, + ] + ops_config = [ + { + "op_type": "fill_constant", + "op_inputs": {}, + "op_outputs": { + "Out": ["shapeT1_data"] + }, + "op_attrs": { + "dtype": 2, + "str_value": "2", + "shape": [1], + }, + }, + { + "op_type": "fill_constant", + "op_inputs": {}, + "op_outputs": { + "Out": ["shapeT2_data"] + }, + "op_attrs": { + "dtype": 2, + "str_value": "24", + "shape": [1], + }, + }, + { + "op_type": "reshape", + "op_inputs": dics_intput[0], + "op_outputs": { + "Out": ["reshape_out"] + }, + "op_attrs": dics[0] + }, + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "reshape_input": + TensorConfig(data_gen=partial(generate_input1, dics)) + }, + outputs=["reshape_out"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(): + if self.dims == 4: + self.dynamic_shape.min_input_shape = { + "reshape_input": [1, 2, 4, 6] + } + self.dynamic_shape.max_input_shape = { + "reshape_input": [4, 2, 4, 6] + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": [1, 2, 4, 6] + } + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "reshape_input": [1, 8, 6] + } + self.dynamic_shape.max_input_shape = { + "reshape_input": [4, 8, 6] + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": [1, 8, 6] + } + elif self.dims == 2: + self.dynamic_shape.min_input_shape = 
{"reshape_input": [1, 48]} + self.dynamic_shape.max_input_shape = {"reshape_input": [4, 48]} + self.dynamic_shape.opt_input_shape = {"reshape_input": [1, 48]} + elif self.dims == 1: + self.dynamic_shape.min_input_shape = {"reshape_input": [48]} + self.dynamic_shape.max_input_shape = {"reshape_input": [48]} + self.dynamic_shape.opt_input_shape = {"reshape_input": [48]} + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-5 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +# reshape having 2 inputs. +class TrtConvertReshapeTest3(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + + def generate_input1(attrs: List[Dict[str, Any]]): + if self.dims == 4: + return np.random.random([1, 2, 12, 6]).astype(np.float32) + elif self.dims == 3: + return np.random.random([1, 8, 18]).astype(np.float32) + elif self.dims == 2: + return np.random.random([1, 144]).astype(np.float32) + elif self.dims == 1: + return np.random.random([144]).astype(np.float32) + + for dims in [4, 3, 2, 1]: + for shape in [[-1, 144]]: + dics = [{ + "shape": shape, + }, {}] + self.dims = dims + dics_intput = [ + { + "X": ["reshape_input"], + "shape_data": ["shape_data"], + }, + ] + ops_config = [ + { + "op_type": "fill_constant", + "op_inputs": {}, + "op_outputs": { + "Out": ["shape_data"] + }, + "op_attrs": { + "dtype": 2, + "str_value": "12", + "shape": [2], + }, + }, + { + "op_type": "reshape", + "op_inputs": dics_intput[0], + "op_outputs": { + "Out": ["reshape_out"] + }, + "op_attrs": dics[0] + }, + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "reshape_input": + TensorConfig(data_gen=partial(generate_input1, dics)) + }, + outputs=["reshape_out"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(): + if self.dims == 4: + self.dynamic_shape.min_input_shape = { + "reshape_input": [1, 2, 12, 6] + } + self.dynamic_shape.max_input_shape = { + "reshape_input": [4, 2, 12, 6] + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": [1, 2, 12, 6] + } + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "reshape_input": [1, 8, 18] + } + self.dynamic_shape.max_input_shape = { + "reshape_input": [4, 8, 18] + } + self.dynamic_shape.opt_input_shape = { + "reshape_input": [1, 8, 18] + } + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"reshape_input": [1, 144]} + self.dynamic_shape.max_input_shape = {"reshape_input": [4, 144]} + self.dynamic_shape.opt_input_shape = {"reshape_input": [1, 144]} + elif self.dims == 1: + self.dynamic_shape.min_input_shape = {"reshape_input": [144]} + self.dynamic_shape.max_input_shape = {"reshape_input": [144]} + self.dynamic_shape.opt_input_shape = {"reshape_input": [144]} + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-5 + + def add_skip_trt_case(self): + pass def test(self): 
self.add_skip_trt_case() From 0fd974b42e157a5d7e442d8e7c880fffb6d19249 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Mon, 18 Jul 2022 10:51:31 +0800 Subject: [PATCH 233/250] set seed for uts (#44372) --- python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py index 60414b8de97a5..b283c80adfd9a 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_lstsq_op.py @@ -30,6 +30,7 @@ def setUp(self): self.devices.append("gpu:0") self.generate_input() self.generate_output() + np.random.seed(2022) def init_config(self): self.dtype = 'float64' From 5c291737830e03cfd816d36056ba48e0fc1fbc35 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Mon, 18 Jul 2022 11:22:37 +0800 Subject: [PATCH 234/250] [Paddle-TRT] remove useless code in fc (#44382) * remove useless code in fc --- .../fluid/inference/tensorrt/convert/fc_op.cc | 100 +---- .../ir/inference/test_trt_convert_fc.py | 361 ++++++++++++++++++ 2 files changed, 377 insertions(+), 84 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 1bd9cf8712d98..0d61dc6d0ea96 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -333,91 +333,23 @@ class FcOpConverter : public OpConverter { if (!engine_->with_dynamic_shape()) { x_num_col_dims--; } - // If use tensorrt'oss, the x_dim and x_num_col_dims need change, and can - // not add Shuffle layer in ernie's multihead. 
- if (x_dim.nbDims == 4 && x_num_col_dims == 1) { - if (enable_int8 || support_int8) { - // add conv1x1 layer - nvinfer1::DimsHW nv_ksize(1, 1); - auto* fc_layer_int8 = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, - *X, - n_output, - nv_ksize, - weight.get(), - bias.get()); - if (activation_type == "relu") { - fc_layer_int8->setName( - ("ernie_fc_op_int8: Convolution (Output: " + output_name + ")") - .c_str()); - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("out_threshold"), - true, - platform::errors::InvalidArgument( - "must have out threshold in fc layers in int8 mode")); - float out_scale = 0; - if (enable_int8) { - out_scale = - BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); - } else { - out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Out")); - } - engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), - out_scale); - nvinfer1::IActivationLayer* relu_layer_int8 = - TRT_ENGINE_ADD_LAYER(engine_, - Activation, - *(fc_layer_int8->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer_int8, - "relu_after_ernie_fc_int8", - {output_name}, - test_mode); - } else { - RreplenishLayerAndOutput(fc_layer_int8, - "ernie_fc_op_int8: Convolution", - {output_name}, - test_mode); - } - } else { - // add fc layer - auto* fc_layer_float = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *X, n_output, weight.get(), bias.get()); - if (activation_type == "relu") { - fc_layer_float->setName( - ("ernie_fc_op_float: (Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer_float = - TRT_ENGINE_ADD_LAYER(engine_, - Activation, - *(fc_layer_float->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer_float, - "relu_after_ernie_fc_float", - {output_name}, - test_mode); - } else { - RreplenishLayerAndOutput( - fc_layer_float, "ernie_fc_op_float", {output_name}, test_mode); - } - } - } else { // need reshape input before and after fc - PADDLE_ENFORCE_GT( - x_dim.nbDims, - x_num_col_dims, - platform::errors::InvalidArgument( - "Params and input dims mismatch. Paddle-TRT FC " - "converter expects x_dim.nbDims > x_num_col_dims, but " - "x_dim.nbDims : %d, x_num_col_dims : %d.", - x_dim.nbDims, - x_num_col_dims)); - auto* reshape_before_fc_layer = - reshape_before_fc(X, x_dim, x_num_col_dims, output_name); - auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); - if (enable_int8 || support_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } - regist_fc(reshape_itensor, n_output, weight, bias); + PADDLE_ENFORCE_GT( + x_dim.nbDims, + x_num_col_dims, + platform::errors::InvalidArgument( + "Params and input dims mismatch. 
Paddle-TRT FC " + "converter expects x_dim.nbDims > x_num_col_dims, but " + "x_dim.nbDims : %d, x_num_col_dims : %d.", + x_dim.nbDims, + x_num_col_dims)); + // need reshape input before and after fc + auto* reshape_before_fc_layer = + reshape_before_fc(X, x_dim, x_num_col_dims, output_name); + auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); + if (enable_int8 || support_int8) { + engine_->SetTensorDynamicRange(reshape_itensor, in_scale); } + regist_fc(reshape_itensor, n_output, weight, bias); } }; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py new file mode 100644 index 0000000000000..9b6badf394e0b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py @@ -0,0 +1,361 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import unittest +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import os + + +class TrtConvertFcTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + # The output has diff between gpu and trt in CI windows + if (os.name == 'nt'): + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.random.random([batch, 3, 64, (int)(attrs[0]["m"] / 2), + 2]).astype(np.float32) + + def generate_w(batch, attrs: List[Dict[str, Any]]): + return np.random.random([attrs[0]["m"], + attrs[0]["n"]]).astype(np.float32) + + def generate_bias(batch, attrs: List[Dict[str, Any]]): + return np.random.random([attrs[0]["n"]]).astype(np.float32) + + for batch in [1, 4]: + for [m, n] in [[32, 23]]: + dics = [ + { + "in_num_col_dims": 3, + # for my conveinence + "m": m, + "n": n, + }, + {} + ] + + ops_config = [ + { + "op_type": "fc", + "op_inputs": { + "Input": ["input_data"], + "W": ["w_data"], + "Bias": ["bias_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "w_data": + TensorConfig(data_gen=partial(generate_w, batch, dics)), + "bias_data": + TensorConfig( + data_gen=partial(generate_bias, batch, dics)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input1, batch, dics)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 16, 2], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 
64, 16, 2], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 16, 2], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # # for static_shape + # clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def test(self): + self.run_test() + + def test_quant(self): + self.run_test(quant=True) + + +class TrtConvertFcTest2(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + # The output has diff between gpu and trt in CI windows + if (os.name == 'nt'): + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.random.random([batch, 3, 64, 14]).astype(np.float32) + + def generate_w(batch, attrs: List[Dict[str, Any]]): + return np.random.random([attrs[0]["m"], + attrs[0]["n"]]).astype(np.float32) + + def generate_bias(batch, attrs: List[Dict[str, Any]]): + return np.random.random([attrs[0]["n"]]).astype(np.float32) + + for batch in [1, 4]: + for [m, n] in [[14, 43]]: + dics = [ + { + "in_num_col_dims": 3, + # for my conveinence + "m": m, + "n": n, + }, + {} + ] + + ops_config = [ + { + "op_type": "fc", + "op_inputs": { + "Input": ["input_data"], + "W": ["w_data"], + "Bias": ["bias_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "w_data": + TensorConfig(data_gen=partial(generate_w, batch, dics)), + "bias_data": + TensorConfig( + data_gen=partial(generate_bias, batch, dics)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input1, batch, dics)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 14], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 64, 14], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 64, 14], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + # # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape() + 
self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + + def test(self): + self.run_test() + + +# this is the special case when x_dim.nbDims == 4 && x_num_col_dims == 1 +class TrtConvertFcTest3(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(batch, attrs: List[Dict[str, Any]]): + return np.ones([batch, 14, 1, 2]).astype(np.float32) + + def generate_w(batch, attrs: List[Dict[str, Any]]): + return np.ones([attrs[0]["m"], attrs[0]["n"]]).astype(np.float32) + + def generate_bias(batch, attrs: List[Dict[str, Any]]): + return np.ones([attrs[0]["n"]]).astype(np.float32) + + for batch in [1, 4]: + for [m, n] in [[28, 43]]: + dics = [ + { + "in_num_col_dims": 1, + "Input_scale": 0.1, + "out_threshold": 0.1, + "enable_int8": True, + # for my conveinence + "m": m, + "n": n, + }, + {} + ] + + ops_config = [ + { + "op_type": "fc", + "op_inputs": { + "Input": ["input_data"], + "W": ["w_data"], + "Bias": ["bias_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }, + ] + + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "w_data": + TensorConfig(data_gen=partial(generate_w, batch, dics)), + "bias_data": + TensorConfig( + data_gen=partial(generate_bias, batch, dics)) + }, + inputs={ + "input_data": + TensorConfig( + data_gen=partial(generate_input1, batch, dics)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 14, 1, 2], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 14, 1, 2], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 14, 1, 2], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Int8 + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + + def test(self): + self.run_test() + + def test_quant(self): + self.run_test(quant=True) + + +if __name__ == "__main__": + unittest.main() From 7a85ced396b28ddf22e286a8f3c71a8eb1d65136 Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Mon, 18 Jul 2022 11:24:38 +0800 Subject: [PATCH 235/250] [Paddle-TRT] Fix cast (#44312) * fix_cast * fix_cast * commit --- paddle/fluid/inference/tensorrt/op_teller.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc 
b/paddle/fluid/inference/tensorrt/op_teller.cc index 05d0b41f14e1f..894e44fda9496 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -2061,6 +2061,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, } if (op_type == "cast") { +// trt 6015 result in Windows ppyolo_mbv3 TRT fp32 diff +#if !IS_TRT_VERSION_GE(7000) + return false; +#endif int in_dtype = BOOST_GET_CONST(int, desc.GetAttr("in_dtype")); int out_dtype = BOOST_GET_CONST(int, desc.GetAttr("out_dtype")); if ((in_dtype == 4 || in_dtype == 5) && out_dtype == 4) { From 39e5dd2efbb6a9c07f8bd0aa4861d321e744aa28 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Mon, 18 Jul 2022 14:18:30 +0800 Subject: [PATCH 236/250] Polish jit layer cmakelists to hide some message (#44351) --- paddle/fluid/jit/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index 75483ac6544f4..e15ef14e5dc59 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -34,9 +34,9 @@ if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") add_custom_target( jit_download_program - COMMAND wget -nc + COMMAND wget -nc -q https://paddle-ci.gz.bcebos.com/dy2st/multi_program_load.tar.gz - COMMAND tar zxvf multi_program_load.tar.gz) + COMMAND tar zxf multi_program_load.tar.gz) set(JIT_DEPS phi elementwise_add_op From 3c074de44b4e0f9265e1a37eec337f16ecd6f106 Mon Sep 17 00:00:00 2001 From: Wilber Date: Mon, 18 Jul 2022 14:30:41 +0800 Subject: [PATCH 237/250] Enable inference multi stream ci test (#44275) * test * update --- .../inference/tests/infer_ut/test_LeViT.cc | 124 +++++++++--------- .../kernels/funcs/concat_and_split_functor.cu | 2 +- 2 files changed, 64 insertions(+), 62 deletions(-) diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index c36968b7ed6f8..056371b0ae662 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -179,67 +179,69 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { } #ifdef PADDLE_WITH_GPU -// TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) { -// int thread_num = 4; - -// // init stream -// std::vector streams(thread_num); -// for (size_t i = 0; i < thread_num; ++i) { -// cudaStreamCreate(&streams[i]); -// } - -// // init input data -// std::map my_input_data_map; -// my_input_data_map["x"] = PrepareInput(2); -// // init output data -// std::map infer_output_data, -// truth_output_data; -// // prepare groudtruth config -// paddle_infer::Config config, config_no_ir; -// config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", -// FLAGS_modeldir + "/inference.pdiparams"); -// config_no_ir.SwitchIrOptim(false); -// // prepare inference config -// config.SetModel(FLAGS_modeldir + "/inference.pdmodel", -// FLAGS_modeldir + "/inference.pdiparams"); -// config.EnableUseGpu(100, 0); -// config.EnableTensorRtEngine( -// 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); -// // get groudtruth by disbale ir - -// paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); -// SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, -// &truth_output_data, 1); - -// // get infer results from multi threads -// std::vector threads; -// config.SetExecStream(streams[0]); -// 
config.pass_builder()->DeletePass("add_support_int8_pass"); -// auto main_predictor = CreatePredictor(config); -// std::vector predictors; -// for (size_t i = 0; i < thread_num - 1; ++i) { -// predictors.push_back(std::move(main_predictor->Clone(streams[i + 1]))); -// LOG(INFO) << "predictors[" << i << "] stream is " -// << predictors[i]->GetExecStream(); -// } -// predictors.push_back(std::move(main_predictor)); -// LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is " -// << predictors[thread_num - 1]->GetExecStream(); -// for (int i = 0; i < thread_num; ++i) { -// threads.emplace_back(paddle::test::SingleThreadPrediction, -// predictors[i].get(), &my_input_data_map, -// &infer_output_data, 10); -// } - -// // thread join & check outputs -// for (int i = 0; i < thread_num; ++i) { -// LOG(INFO) << "join tid : " << i; -// threads[i].join(); -// CompareRecord(&truth_output_data, &infer_output_data); -// } - -// std::cout << "finish multi-thread test" << std::endl; -// } +TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) { + int thread_num = 4; + + // init stream + std::vector streams(thread_num); + for (size_t i = 0; i < thread_num; ++i) { + cudaStreamCreate(&streams[i]); + } + + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); + // get groudtruth by disbale ir + + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction( + pred_pool_no_ir.Retrive(0), &my_input_data_map, &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + config.SetExecStream(streams[0]); + config.pass_builder()->DeletePass("add_support_int8_pass"); + auto main_predictor = CreatePredictor(config); + std::vector predictors; + for (size_t i = 0; i < thread_num - 1; ++i) { + predictors.push_back(std::move(main_predictor->Clone(streams[i + 1]))); + LOG(INFO) << "predictors[" << i << "] stream is " + << predictors[i]->GetExecStream(); + } + predictors.push_back(std::move(main_predictor)); + LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is " + << predictors[thread_num - 1]->GetExecStream(); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + predictors[i].get(), + &my_input_data_map, + &infer_output_data, + 10); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data); + } + + std::cout << "finish multi-thread test" << std::endl; +} #endif } // namespace paddle_infer diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index dbcd4016170d5..01701ee287385 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { namespace funcs { From 74412dfefff946551f6f38cf41c58b37ba0710f8 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Mon, 18 Jul 2022 14:47:48 +0800 Subject: [PATCH 238/250] fix bug of old pp (#44361) --- python/paddle/distributed/collective.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 2506c3073941a..62b18298f11e0 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -2030,6 +2030,10 @@ def alltoall_single(in_tensor, return task +def _get_group_rank(global_rank, group=None): + return global_rank if group is None else group.get_group_rank(global_rank) + + def send(tensor, dst=0, group=None, use_calc_stream=True): """ Send a tensor to the receiver. @@ -2062,11 +2066,10 @@ def send(tensor, dst=0, group=None, use_calc_stream=True): """ if group is not None and not group.is_member(): return - + dst = _get_group_rank(dst, group) if in_dygraph_mode(): group = _get_default_group() if group is None else group - group_dst_rank = group.get_group_rank(dst) - task = group.process_group.send(tensor, group_dst_rank) + task = group.process_group.send(tensor, dst) if use_calc_stream: task.wait() return None @@ -2126,10 +2129,10 @@ def recv(tensor, src=0, group=None, use_calc_stream=True): if group is not None and not group.is_member(): return + src = _get_group_rank(src, group) if in_dygraph_mode(): group = _get_default_group() if group is None else group - group_src_rank = group.get_group_rank(src) - task = group.process_group.recv(tensor, group_src_rank) + task = group.process_group.recv(tensor, src) if use_calc_stream: task.wait() return None From 02e9453fa277b58007674f559e6b7c45561a41e7 Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Mon, 18 Jul 2022 15:02:11 +0800 Subject: [PATCH 239/250] add xpu resnet_unit (#44297) * add xpu resnet_unit *test=kunlun * tmp *test=kunlun --- cmake/external/xpu.cmake | 4 +- paddle/fluid/operators/fused/CMakeLists.txt | 1 + .../fluid/operators/fused/resnet_unit_op.cc | 26 +- .../operators/fused/resnet_unit_op_xpu.cc | 333 ++++++++++++++++++ .../fluid/platform/device/xpu/xpu2_op_list.h | 3 + .../unittests/xpu/get_test_cover_info.py | 4 +- .../paddle/incubate/operators/resnet_unit.py | 22 +- 7 files changed, 376 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/operators/fused/resnet_unit_op_xpu.cc diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index c1f8eb0e33c79..81128ccf3b6a0 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220712") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220718") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -19,7 +19,7 @@ endif() if(NOT DEFINED XPU_XDNN_BASE_URL) set(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220712") + 
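Regarding the collective send/recv fix above (#44361): with the new _get_group_rank helper, dst and src keep their global-rank meaning even when a group is passed, and the group-local rank is resolved inside send/recv. Below is a minimal two-rank sketch of that call pattern; the launcher, backend, and tensor shapes are illustrative assumptions, while send, recv, and new_group are the APIs touched by the patch.

import paddle
import paddle.distributed as dist

dist.init_parallel_env()
group = dist.new_group([0, 1])

data = paddle.ones([2, 2])
if dist.get_rank() == 0:
    # dst is the global rank; the group-local rank is derived internally
    dist.send(data, dst=1, group=group)
else:
    data = paddle.zeros([2, 2])
    dist.recv(data, src=0, group=group)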
set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220718") else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index dfbdaed87614f..02a3f4d7a0eb6 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -35,6 +35,7 @@ op_library(fusion_lstm_op) if(WITH_XPU) op_library(resnet_basic_block_op) + op_library(resnet_unit_op) endif() if(WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc index 4f4e0aa6ac29a..5852a5c04bde6 100644 --- a/paddle/fluid/operators/fused/resnet_unit_op.cc +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -159,22 +159,28 @@ class ResNetUnitOp : public framework::OperatorWithKernel { bn_param_dims, bn_param_dims.size())); auto data_format = ctx->Attrs().Get("data_format"); - PADDLE_ENFORCE_EQ( - data_format, - "NHWC", - platform::errors::InvalidArgument("The data format must equal to NHWC. " - "But received: the data format " - "= [%s]", - data_format)); + bool is_nchw = (data_format == "NCHW"); // Calculate the dims of outputs int batch = x_dims[0]; int output_channel = w_dims[0]; int filter_size = w_dims[2]; int stride = ctx->Attrs().Get("stride"); int padding = ctx->Attrs().Get("padding"); - int out_h = (x_dims[1] + padding * 2 - filter_size) / stride + 1; - int out_w = (x_dims[2] + padding * 2 - filter_size) / stride + 1; - std::vector out_shape = {batch, out_h, out_w, output_channel}; + std::vector out_shape; + out_shape.push_back(batch); + if (is_nchw) { + int out_h = (x_dims[2] + padding * 2 - filter_size) / stride + 1; + int out_w = (x_dims[3] + padding * 2 - filter_size) / stride + 1; + out_shape.push_back(output_channel); + out_shape.push_back(out_h); + out_shape.push_back(out_w); + } else { + int out_h = (x_dims[1] + padding * 2 - filter_size) / stride + 1; + int out_w = (x_dims[2] + padding * 2 - filter_size) / stride + 1; + out_shape.push_back(out_h); + out_shape.push_back(out_w); + out_shape.push_back(output_channel); + } auto y_dims = phi::make_ddim(out_shape); auto bitmask_dims = GetBitmaskDims(out_shape); diff --git a/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc new file mode 100644 index 0000000000000..cce506c67abe2 --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_unit_op_xpu.cc @@ -0,0 +1,333 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ResNetUnitXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(place), + true, + platform::errors::PreconditionNotMet("It must use XPUPlace.")); + + bool is_nchw = (ctx.Attr("data_format") == "NCHW"); + // input x + const Tensor *input_x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *bias_x = ctx.Input("BiasX"); + + // output x + Tensor *conv_out_x = ctx.Output("ConvX"); + Tensor *saved_mean_x = ctx.Output("SavedMeanX"); + Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); + Tensor *running_mean_x = ctx.Output("RunningMeanX"); + Tensor *running_var_x = ctx.Output("RunningVarX"); + + Tensor *output = ctx.Output("Y"); + + // attrs + int padding = ctx.Attr("padding"); + int stride = ctx.Attr("stride"); + int stride_z = ctx.Attr("stride_z"); + int dilation = ctx.Attr("dilation"); + int group = ctx.Attr("group"); + float eps = ctx.Attr("epsilon"); + float momentum = ctx.Attr("momentum"); + bool has_shortcut = ctx.Attr("has_shortcut"); + bool fuse_add = ctx.Attr("fuse_add"); + bool use_global_stats = ctx.Attr("use_global_stats"); + bool is_test = ctx.Attr("is_test"); + bool is_train = !is_test && !use_global_stats; + std::string act_type = ctx.Attr("act_type"); + auto &dev_ctx = ctx.template device_context(); + + std::vector x_list = {input_x->data()}; + std::vector w_list = {filter_x->data()}; + std::vector conv_y_list = {conv_out_x->mutable_data(place)}; + + std::vector> x_shape_list = { + phi::vectorize(input_x->dims())}; + + auto filter_x_shape = phi::vectorize(filter_x->dims()); + std::vector ksize = {filter_x_shape[2], filter_x_shape[3]}; + if (!is_nchw) { + ksize[0] = filter_x_shape[1]; + ksize[1] = filter_x_shape[2]; + } + std::vector strides = {stride, stride}; + std::vector> ksize_list = {ksize}; + std::vector> stride_list = {strides}; + std::vector paddings = {padding, padding}; + std::vector dilations = {dilation, dilation}; + std::vector scale_list = {scale_x->data()}; + std::vector bias_list = {bias_x->data()}; + std::vector batch_mean_list = { + saved_mean_x->mutable_data(place)}; + std::vector batch_invstd_list = { + saved_invstd_x->mutable_data(place)}; + std::vector global_mean_list = { + running_mean_x->mutable_data(place)}; + std::vector global_var_list = { + running_var_x->mutable_data(place)}; + + std::vector x_maxlist = {nullptr}; + std::vector w_maxlist = {nullptr}; + if (has_shortcut) { + // input z + const Tensor *input_z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *bias_z = ctx.Input("BiasZ"); + + Tensor *conv_out_z = ctx.Output("ConvZ"); + Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); + Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); + Tensor *running_mean_z = ctx.Output("RunningMeanZ"); + Tensor *running_var_z = ctx.Output("RunningVarZ"); + + x_list.push_back(input_z->data()); + w_list.push_back(filter_z->data()); + conv_y_list.push_back(conv_out_z->mutable_data(place)); + + x_shape_list.push_back(phi::vectorize(input_z->dims())); + + auto filter_z_shape = phi::vectorize(filter_z->dims()); + 
std::vector ksize_z = {filter_z_shape[2], filter_z_shape[3]}; + if (!is_nchw) { + ksize_z[0] = filter_z_shape[1]; + ksize_z[1] = filter_z_shape[2]; + } + ksize_list.push_back(ksize_z); + stride_list.push_back({stride_z, stride_z}); + scale_list.push_back(scale_z->data()); + bias_list.push_back(bias_z->data()); + batch_mean_list.push_back(saved_mean_z->mutable_data(place)); + batch_invstd_list.push_back(saved_invstd_z->mutable_data(place)); + global_mean_list.push_back(running_mean_z->mutable_data(place)); + global_var_list.push_back(running_var_z->mutable_data(place)); + x_maxlist.push_back(nullptr); + w_maxlist.push_back(nullptr); + } else { + if (fuse_add) { + const Tensor *input_z = ctx.Input("Z"); + auto input_z_shape = phi::vectorize(input_z->dims()); + x_list.push_back(input_z->data()); + x_shape_list.push_back(input_z_shape); + x_maxlist.push_back(nullptr); + } + } + int r = xpu::resnet_unit_fusion( + dev_ctx.x_context(), + x_list, + w_list, + conv_y_list, + output->mutable_data(place), + x_shape_list, + filter_x_shape[0], + ksize_list, + stride_list, + paddings, + dilations, + group, + eps, + momentum, + x_maxlist, + w_maxlist, + scale_list, + bias_list, + batch_mean_list, + batch_invstd_list, + global_mean_list, + global_var_list, + xpu::Activation_t::RELU, + is_nchw, + has_shortcut, + fuse_add, + is_train); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_fusion"); + } +}; + +template +class ResNetUnitGradXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ( + platform::is_xpu_place(place), + true, + platform::errors::PreconditionNotMet("It must use XPUPlace.")); + + bool is_nchw = (ctx.Attr("data_format") == "NCHW"); + const Tensor *y_grad = ctx.Input(framework::GradVarName("Y")); + const Tensor *x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); + const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); + const Tensor *conv_out_x = ctx.Input("ConvX"); + const Tensor *output = ctx.Input("Y"); + + Tensor *x_grad = ctx.Output(framework::GradVarName("X")); + Tensor *filter_x_grad = + ctx.Output(framework::GradVarName("FilterX")); + Tensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); + Tensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); + + int padding = ctx.Attr("padding"); + int stride = ctx.Attr("stride"); + int stride_z = ctx.Attr("stride_z"); + int dilation = ctx.Attr("dilation"); + int group = ctx.Attr("group"); + float eps = ctx.Attr("epsilon"); + bool has_shortcut = ctx.Attr("has_shortcut"); + bool fuse_add = ctx.Attr("fuse_add"); + std::string act_type = ctx.Attr("act_type"); + + auto &dev_ctx = ctx.template device_context(); + + std::vector x_list = {x->data()}; + std::vector w_list = {filter_x->data()}; + std::vector conv_y_list = {conv_out_x->data()}; + std::vector dx_list = {x_grad->mutable_data(place)}; + std::vector dw_list = {filter_x_grad->mutable_data(place)}; + + std::vector> x_shape_list = { + phi::vectorize(x->dims())}; + + auto filter_x_shape = phi::vectorize(filter_x->dims()); + std::vector x_ksize = {filter_x_shape[2], filter_x_shape[3]}; + if (!is_nchw) { + x_ksize[0] = filter_x_shape[1]; + x_ksize[1] = filter_x_shape[2]; + } + std::vector> ksize_list = {x_ksize}; + std::vector> stride_list = {{stride, stride}}; + std::vector paddings = {padding, padding}; + std::vector dilations = 
{dilation, dilation}; + + std::vector x_maxlist = {nullptr}; + std::vector w_maxlist = {nullptr}; + + std::vector scale_list = {scale_x->data()}; + std::vector batch_mean_list = {saved_mean_x->data()}; + std::vector batch_invstd_list = { + saved_invstd_x->data()}; + std::vector dscale_list = { + scale_x_grad->mutable_data(place)}; + std::vector dbias_list = {bias_x_grad->mutable_data(place)}; + + if (has_shortcut) { + // X Z + // | | + // NormConv NormConv + // | | + // BNStatsFinalize BNStatsFinalize + // \ / + // ScaleBiasAddRelu + // | + // Y + const Tensor *z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); + const Tensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); + const Tensor *conv_out_z = ctx.Input("ConvZ"); + + Tensor *z_grad = ctx.Output(framework::GradVarName("Z")); + Tensor *filter_z_grad = + ctx.Output(framework::GradVarName("FilterZ")); + Tensor *scale_z_grad = + ctx.Output(framework::GradVarName("ScaleZ")); + Tensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); + x_list.push_back(z->data()); + w_list.push_back(filter_z->data()); + conv_y_list.push_back(conv_out_z->data()); + dx_list.push_back(z_grad->mutable_data(place)); + dw_list.push_back(filter_z_grad->mutable_data(place)); + x_shape_list.push_back(phi::vectorize(z->dims())); + + auto filter_z_shape = phi::vectorize(filter_z->dims()); + std::vector ksize_z = {filter_z_shape[2], filter_z_shape[3]}; + if (!is_nchw) { + ksize_z[0] = filter_z_shape[1]; + ksize_z[1] = filter_z_shape[2]; + } + ksize_list.push_back(ksize_z); + stride_list.push_back({stride_z, stride_z}); + x_maxlist.push_back(nullptr); + w_maxlist.push_back(nullptr); + + scale_list.push_back(scale_z->data()); + batch_mean_list.push_back(saved_mean_z->data()); + batch_invstd_list.push_back(saved_invstd_z->data()); + dscale_list.push_back(scale_z_grad->mutable_data(place)); + dbias_list.push_back(bias_z_grad->mutable_data(place)); + } else { + if (fuse_add) { + auto z_grad = ctx.Output(framework::GradVarName("Z")); + dx_list.push_back(z_grad->mutable_data(place)); + } + } + + int r = + xpu::resnet_unit_grad_fusion(dev_ctx.x_context(), + x_list, + w_list, + y_grad->data(), + output->data(), + conv_y_list, + dx_list, + dw_list, + x_shape_list, + filter_x_shape[0], + ksize_list, + stride_list, + paddings, + dilations, + group, + x_maxlist, + w_maxlist, + scale_list, + batch_mean_list, + batch_invstd_list, + dscale_list, + dbias_list, + xpu::Activation_t::RELU, + eps, + is_nchw, + has_shortcut, + fuse_add); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_grad_fusion"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_XPU_KERNEL(resnet_unit, ops::ResNetUnitXPUKernel); +REGISTER_OP_XPU_KERNEL(resnet_unit_grad, ops::ResNetUnitGradXPUKernel); diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index bd5957a122885..8cae8cfe534ef 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -374,6 +374,9 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"resnet_unit", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"resnet_unit_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rmsprop", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py index f58c0d4cf074c..bcaa8055b25cd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -87,7 +87,9 @@ xpu_test_op_type_white_list = [ 'dropout_float16', 'dropout_grad_float16', - "grad_add_float32" # no api for grad_add, skip + "grad_add_float32", # no api for grad_add, skip + "resnet_unit", + "resnet_unit_grad" ] xpu_test_device_op_white_list = [] xpu_test_device_op_type_white_list = [] diff --git a/python/paddle/incubate/operators/resnet_unit.py b/python/paddle/incubate/operators/resnet_unit.py index 6333ddafe1096..70abe41f62462 100644 --- a/python/paddle/incubate/operators/resnet_unit.py +++ b/python/paddle/incubate/operators/resnet_unit.py @@ -170,7 +170,7 @@ def __init__(self, self._is_test = is_test # check format - valid_format = {'NHWC'} + valid_format = {'NHWC', 'NCHW'} if data_format not in valid_format: raise ValueError( "conv_format must be one of {}, but got conv_format='{}'". @@ -181,11 +181,25 @@ def _get_default_param_initializer(channels): std = (2.0 / filter_elem_num)**0.5 return I.Normal(0.0, std) + is_nchw = (data_format == 'NCHW') # initial filter bn_param_dtype = fluid.core.VarDesc.VarType.FP32 - bn_param_shape = [1, 1, 1, num_filters] - filter_x_shape = [num_filters, filter_size, filter_size, num_channels_x] - filter_z_shape = [num_filters, filter_size, filter_size, num_channels_z] + if not is_nchw: + bn_param_shape = [1, 1, 1, num_filters] + filter_x_shape = [ + num_filters, filter_size, filter_size, num_channels_x + ] + filter_z_shape = [ + num_filters, filter_size, filter_size, num_channels_z + ] + else: + bn_param_shape = [1, num_filters, 1, 1] + filter_x_shape = [ + num_filters, num_channels_x, filter_size, filter_size + ] + filter_z_shape = [ + num_filters, num_channels_z, filter_size, filter_size + ] self.filter_x = self.create_parameter( shape=filter_x_shape, From b83138d0d2a8f729194e54b298f65d7fde1b30ce Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Mon, 18 Jul 2022 15:13:27 +0800 Subject: [PATCH 240/250] add blacklist in prim2orig interface (#44383) --- .../unittests/autograd/test_transform.py | 44 +++++++++++++++++++ python/paddle/incubate/autograd/primx.py | 17 ++++--- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_transform.py b/python/paddle/fluid/tests/unittests/autograd/test_transform.py index 08626593e2904..f976ef729cc7a 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_transform.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_transform.py @@ -88,6 +88,12 @@ def init_data(self): 'mul_p', 'mul_p' ] + self.prim2orig_ops_with_blacklist = [ + 'tanh', 'tanh', 'add_p', 'fill_constant', 'fill_constant', + 'fill_constant', 'elementwise_mul', 'sub_p', 'fill_constant', + 'elementwise_mul', 'sub_p', 'fill_constant', 'elementwise_mul', + 'elementwise_mul' + ] self.prim2orig_ops = [ 'tanh', 'tanh', 'elementwise_add', 'fill_constant', 'fill_constant', 'fill_constant', 'elementwise_mul', 'elementwise_sub', @@ -132,6 +138,13 @@ def test_run(self): for k, v in self.ys_shape_map.items(): 
self.assertEqual(flatten_ys_bar[k].shape, v) + # Test prim2orig with blacklist + prim2orig(block=self.main_program.block(0), + blacklist=['add_p', 'sub_p']) + prim2orig_ops = [op.type for op in self.main_program.block(0).ops] + self.assertEqual(sorted(prim2orig_ops), + sorted(self.prim2orig_ops_with_blacklist)) + # Test prim2orig prim2orig(block=self.main_program.block(0)) prim2orig_ops = [op.type for op in self.main_program.block(0).ops] @@ -198,6 +211,26 @@ def init_data(self): 'reshape_p', ] + self.prim2orig_ops_with_blacklist = [ + 'reshape2', + 'fill_constant', + 'fill_constant', + 'fill_constant', + 'elementwise_mul', + 'add_p', + 'matmul_v2', + 'fill_constant', + 'fill_constant', + 'fill_constant', + 'elementwise_mul', + 'transpose2', + 'matmul_v2', + 'transpose2', + 'matmul_v2', + # 'elementwise_mul', + 'reshape2', + ] + self.prim2orig_ops = [ 'reshape2', 'fill_constant', @@ -312,6 +345,17 @@ def init_data(self): 'add_p', ] + self.prim2orig_ops_with_blacklist = [ + 'expand_v2', 'add_p', 'reshape2', 'elementwise_mul', 'reduce_sum', + 'sqrt', 'expand_v2', 'sub_p', 'concat', 'gather', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'elementwise_mul', 'reduce_sum', 'reshape2', + 'reshape2', 'elementwise_mul', 'elementwise_mul', 'reshape2', + 'expand_v2', 'elementwise_div', 'reduce_sum', 'reshape2', + 'fill_constant', 'sub_p', 'split', 'fill_constant', 'fill_any_like', + 'add_p', 'scatter', 'elementwise_add', 'add_p' + ] + self.prim2orig_ops = [ 'expand_v2', 'elementwise_add', 'reshape2', 'elementwise_mul', 'reduce_sum', 'sqrt', 'expand_v2', 'elementwise_sub', 'concat', diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 260a97cdc16a4..19f87dd929215 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -408,7 +408,7 @@ def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): # TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. -def _lower(block, reverse): +def _lower(block, reverse, blacklist): # Some functions which are only used in _lower. def bind(args, to_bind, value_table): for i in range(len(args)): @@ -452,7 +452,7 @@ def expand_nested_list(xs): for op_idx in range(len(block.ops)): op = block.ops[op_idx] ops_to_remove.append(op_idx) - if lookup_fn(op.type) is not None: + if lookup_fn(op.type) is not None and op.type not in blacklist: input_args = get_input_var_list(op) bind(input_args, to_bind, value_table) @@ -535,11 +535,11 @@ def orig2prim(block=None): block = default_main_program().current_block() if block is None else block assert block == default_main_program().current_block( ), f'block is neither None nor current block of main program' - _lower(block, reverse=False) + _lower(block, reverse=False, blacklist=[]) @framework.static_only -def prim2orig(block=None): +def prim2orig(block=None, blacklist=None): """ .. note:: **ONLY available in the static mode.** @@ -554,7 +554,11 @@ def prim2orig(block=None): block(paddle.static.Block|None, optional): The target block to process on. Default None, and will process on the current block of main program. - + blacklist(list[string]|None, optional): The names of automatic + differential basic operator that will not be transformed + into original operators. Default None, and the blacklist + is treated as empty list. + Examples: .. 
code-block:: python @@ -576,4 +580,5 @@ def prim2orig(block=None): block = default_main_program().current_block() if block is None else block assert block == default_main_program().current_block( ), f'block is neither None nor current block of main program' - _lower(block, reverse=True) + blacklist = [] if blacklist is None else blacklist + _lower(block, reverse=True, blacklist=blacklist) From 04e55582d265fe3795624a7dff43977aac0574b1 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 18 Jul 2022 15:14:47 +0800 Subject: [PATCH 241/250] [Plugin] Fix Custom device in eager mode, test=develop (#43952) * [Plugin] Fix Custom device in eager mode, test=develop * update test case, test=develop * update ut for coverage, test=develop --- CMakeLists.txt | 7 ++++- paddle/phi/core/tensor_utils.cc | 4 +++ .../fluid/tests/custom_kernel/CMakeLists.txt | 17 +++++++++-- .../custom_kernel_dot_c_setup.py | 4 +-- .../custom_kernel/custom_kernel_dot_setup.py | 4 +-- .../custom_kernel/test_custom_kernel_dot.py | 14 --------- .../fluid/tests/custom_runtime/CMakeLists.txt | 11 +++++-- .../custom_runtime/test_custom_cpu_plugin.py | 29 ++++++++++++++----- 8 files changed, 59 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ea4bc8a2d6c3e..78ebbccfb2e7a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -336,7 +336,12 @@ endif() if(LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER) - set(WITH_CUSTOM_DEVICE ON) + set(WITH_CUSTOM_DEVICE + ON + CACHE BOOL "Enable Custom Device when compiling for Linux" FORCE) + message( + "Enable Custom Device when compiling for Linux. Force WITH_CUSTOM_DEVICE=ON." + ) endif() if(WIN32) diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index e2d8d5a03651c..45f6c00affe05 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -53,6 +53,10 @@ void Copy(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU } else if (paddle::platform::is_xpu_place(dst_place)) { dst_ptr = dev_ctx.Alloc(dst, src.dtype()); +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + } else if (paddle::platform::is_custom_place(dst_place)) { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); #endif } diff --git a/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt b/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt index b2bdfac908069..af700c22038e3 100644 --- a/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt @@ -1,2 +1,15 @@ -py_test(test_custom_kernel_dot SRCS test_custom_kernel_dot.py) -py_test(test_custom_kernel_load SRCS test_custom_kernel_load.py) +file( + GLOB TEST_OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +set(CUSTOM_ENVS + PADDLE_SOURCE_DIR=${PADDLE_SOURCE_DIR} + PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} + CUSTOM_DEVICE_ROOT=${CMAKE_BINARY_DIR}/python/paddle/fluid/tests/custom_kernel +) + +foreach(TEST_OP ${TEST_OPS}) + py_test(${TEST_OP} SRCS ${TEST_OP}.py ENVS ${CUSTOM_ENVS}) +endforeach() diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py index 11fdc9d0addfa..e162daf2b87e1 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py @@ -48,8 +48,8 @@ def build_extensions(self): os.path.join(site_packages_path, 'paddle', 'include'), ] # include path third_party -compile_third_party_path = 
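Regarding the blacklist argument added to prim2orig above (#44383), here is a minimal static-graph sketch of how it can be used. The import path and the enable_prim/gradients flow are assumptions based on the incubate autograd API; add_p and sub_p are the primitive op names exercised by the updated unit test.

import paddle
from paddle.incubate.autograd import enable_prim, prim2orig

paddle.enable_static()
enable_prim()

main = paddle.static.Program()
with paddle.static.program_guard(main):
    x = paddle.static.data('x', shape=[2, 20], dtype='float32')
    x.stop_gradient = False
    y = paddle.tanh(x)
    x_grad = paddle.static.gradients(y, x)
    # Lower AD primitives back to original operators, but keep add_p and
    # sub_p as primitives because they are blacklisted.
    prim2orig(main.block(0), blacklist=['add_p', 'sub_p'])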
os.path.join(os.environ['PADDLE_ROOT'], - 'build/third_party') +compile_third_party_path = os.path.join(os.environ['PADDLE_BINARY_DIR'], + 'third_party') paddle_custom_kernel_include += [ os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags os.path.join(compile_third_party_path, 'install/glog/include'), # glog diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index 8147fc3d343d6..efe5368cdca56 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -50,8 +50,8 @@ def build_extensions(self): site_packages_path)) # include path third_party -compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], - 'build/third_party') +compile_third_party_path = os.path.join(os.environ['PADDLE_BINARY_DIR'], + 'third_party') paddle_custom_kernel_include += [ os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags os.path.join(compile_third_party_path, 'install/glog/include'), # glog diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py index e28bfe00e7c4f..130f74c06d554 100644 --- a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py +++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py @@ -31,10 +31,6 @@ def setUp(self): cur_dir, sys.executable) os.system(cmd) - # set environment for loading and registering compiled custom kernels - # only valid in current process - os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir - def test_custom_kernel_dot_run(self): # test dot run x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8) @@ -52,9 +48,6 @@ def test_custom_kernel_dot_run(self): "custom kernel dot out: {},\n numpy dot out: {}".format( out.numpy(), result)) - def tearDown(self): - del os.environ['CUSTOM_DEVICE_ROOT'] - class TestCustomKernelDotC(unittest.TestCase): @@ -67,10 +60,6 @@ def setUp(self): cur_dir, sys.executable) os.system(cmd) - # set environment for loading and registering compiled custom kernels - # only valid in current process - os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir - def test_custom_kernel_dot_run(self): # test dot run x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8) @@ -88,9 +77,6 @@ def test_custom_kernel_dot_run(self): "custom kernel dot out: {},\n numpy dot out: {}".format( out.numpy(), result)) - def tearDown(self): - del os.environ['CUSTOM_DEVICE_ROOT'] - if __name__ == '__main__': if os.name == 'nt' or sys.platform.startswith('darwin'): diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt index fa2ea2726cfab..482dc9cb1f3f6 100644 --- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -1,4 +1,11 @@ if(WITH_CUSTOM_DEVICE) - py_test(test_custom_cpu_plugin SRCS test_custom_cpu_plugin.py) - set_tests_properties(test_custom_cpu_plugin PROPERTIES TIMEOUT 120) + file( + GLOB TEST_OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") + string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + + foreach(TEST_OP ${TEST_OPS}) + py_test(${TEST_OP} SRCS ${TEST_OP}.py) + endforeach() endif() diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py index 7da4f38a83686..00d7255a83f21 100644 
--- a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py @@ -32,12 +32,15 @@ def setUp(self): os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build') - def test_custom_device_dataloader(self): + def test_custom_device(self): import paddle with paddle.fluid.framework._test_eager_guard(): self._test_custom_device_dataloader() + self._test_custom_device_mnist() + self._test_eager_backward_api() self._test_custom_device_dataloader() + self._test_custom_device_mnist() def _test_custom_device_dataloader(self): import paddle @@ -60,13 +63,6 @@ def _test_custom_device_dataloader(self): self.assertTrue(label.place.is_custom_place()) break - def test_custom_device_mnist(self): - import paddle - - with paddle.fluid.framework._test_eager_guard(): - self._test_custom_device_mnist() - self._test_custom_device_mnist() - def _test_custom_device_mnist(self): import paddle @@ -120,6 +116,23 @@ def forward(self, inputs, label=None): self.assertTrue(pred.place.is_custom_place()) + def _test_eager_backward_api(self): + x = np.random.random([2, 2]).astype("float32") + y = np.random.random([2, 2]).astype("float32") + grad = np.ones([2, 2]).astype("float32") + + import paddle + paddle.set_device('custom_cpu') + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z1_tensor = paddle.matmul(x_tensor, y_tensor) + z2_tensor = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward([z1_tensor, z2_tensor], [grad_tensor, None]) + + self.assertTrue(x_tensor.grad.place.is_custom_place()) + def tearDown(self): del os.environ['CUSTOM_DEVICE_ROOT'] From fbedf77e3026b48f521ef8b26c5008d5aa29e247 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Mon, 18 Jul 2022 17:15:48 +0800 Subject: [PATCH 242/250] add ipu support for standalone executor. (#44342) --- .../framework/new_executor/data_transfer.cc | 1 + .../framework/new_executor/interpretercore.cc | 6 ++++ .../new_executor/new_executor_defs.h | 2 +- .../framework/new_executor/stream_analyzer.cc | 5 ++-- paddle/fluid/framework/op_registry.h | 6 ++++ paddle/fluid/operators/memcpy_d2h_op.cc | 28 ++++++++++++++++++ paddle/fluid/operators/memcpy_h2d_op.cc | 29 +++++++++++++++++++ paddle/fluid/operators/memcpy_h2d_op.h | 2 +- paddle/fluid/platform/device_event_base.h | 2 +- python/paddle/fluid/executor.py | 4 +-- 10 files changed, 78 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc index b856bbec4b0c4..3cf16266baf08 100644 --- a/paddle/fluid/framework/new_executor/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/data_transfer.cc @@ -315,6 +315,7 @@ std::shared_ptr TransferDevice(const std::string& var_name, op_type = kMemcpyH2D; int dst_place_type = platform::is_gpu_place(dst_place) ? 0 : platform::is_npu_place(dst_place) ? 1 + : platform::is_ipu_place(dst_place) ? 3 : platform::is_xpu_place(dst_place) ? 
2 : -1; attr_map = {{"dst_place_type", dst_place_type}}; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index c321069537c89..3680f0aa900c6 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/kernel_context.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -475,8 +476,13 @@ void InterpreterCore::Convert( BuildSkipShareLoDInfo(); for (size_t i = 0; i < vec_instruction_.size(); ++i) { +#ifdef PADDLE_WITH_IPU + gc_event_.emplace_back(phi::CPUPlace(), 0); +#else gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(), platform::GenerateDeviceEventFlag()); + +#endif } bool inplaced = false; for (auto inst : vec_instruction_) { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index af3951f4538f1..31e27a07c665d 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -390,7 +390,7 @@ static bool IsCpuOp(const Instruction& instr) { // is supported heterogeneous place static bool IsSupportedHetePlace(const phi::Place& place) { return platform::is_gpu_place(place) || platform::is_npu_place(place) || - platform::is_xpu_place(place); + platform::is_xpu_place(place) || platform::is_ipu_place(place); } } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index 086dac8dac1fb..760a852baee68 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -204,8 +204,9 @@ bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr, const Instruction& next_instr) { if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true; - // xpu memcpy kerenl is synchronous. - if (platform::is_xpu_place(place_)) return true; + // xpu&ipu memcpy kerenl is synchronous. + if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_)) + return true; // npu d2h kernel is asynchronous. if (platform::is_npu_place(place_)) { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index d38efbff3165c..53b77d538b3ed 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -408,6 +408,12 @@ struct OpKernelRegistrarFunctorEx, + ops::MemcpyD2HKernel, + paddle::platform::complex, + ops::MemcpyD2HKernel, + plat::float16, + ops::MemcpyD2HKernel, + int16_t, + ops::MemcpyD2HKernel); +#endif diff --git a/paddle/fluid/operators/memcpy_h2d_op.cc b/paddle/fluid/operators/memcpy_h2d_op.cc index 98ed68cf84f87..ff7b786d04018 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.cc +++ b/paddle/fluid/operators/memcpy_h2d_op.cc @@ -100,6 +100,7 @@ class MemcpyH2DOpProtoMaker : public framework::OpProtoAndCheckerMaker { "0. CUDAPinnedPlace/CPU <->CUDAPlace" "1. NPUPinnedPlace/CPU <-> NPUPlace" "2. CPU <->XPUPlace" + "3. CPU <->IPUPlace" "Other place type is Unimplemented and will cause ERROR."); AddComment(R"DOC( MemcpyD2H Operator. 
@@ -233,3 +234,31 @@ REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d, int16_t, ops::MemcpyH2DKernel); #endif + +#ifdef PADDLE_WITH_IPU +REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_h2d, + float, + ops::MemcpyH2DKernel, + double, + ops::MemcpyH2DKernel, + int8_t, + ops::MemcpyH2DKernel, + uint8_t, + ops::MemcpyH2DKernel, + int, + ops::MemcpyH2DKernel, + int64_t, + ops::MemcpyH2DKernel, + bool, + ops::MemcpyH2DKernel, + paddle::platform::bfloat16, + ops::MemcpyH2DKernel, + paddle::platform::complex, + ops::MemcpyH2DKernel, + paddle::platform::complex, + ops::MemcpyH2DKernel, + plat::float16, + ops::MemcpyH2DKernel, + int16_t, + ops::MemcpyH2DKernel); +#endif diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index 3fcc4b89eefe8..8cd84f4b59e8c 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -50,7 +50,7 @@ class MemcpyH2DFunctor { lod_tensor.dtype(), phi::Stream(reinterpret_cast(stream))); - if (dst_place_type_ == 0 || dst_place_type_ == 1 || dst_place_type_ == 2) { + if (dst_place_type_ >= 0 && dst_place_type_ <= 3) { framework::TensorCopy( lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); } else { diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h index 51df0fd4f40ad..a2d3fc1dc3818 100644 --- a/paddle/fluid/platform/device_event_base.h +++ b/paddle/fluid/platform/device_event_base.h @@ -64,7 +64,7 @@ class DeviceEvent { "Required type < %d, but received type = %d", MaxDeviceTypes, type_id_)); - // TODO(Aurelius84): only support CPU/CUDA, need consider XPU/NPU later + // TODO(Aurelius84): only support CPU/CUDA/XPU/NPU. PADDLE_ENFORCE_LT(type_id_, 4, platform::errors::Unavailable( diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index cf00075edcf86..c7bfd19e5a9d0 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1388,8 +1388,8 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, program = pruned_program def _can_use_interpreter_core(program, place): - if core.is_compiled_with_mlu() or core.is_compiled_with_ipu( - ) or isinstance(place, core.CustomPlace): + if core.is_compiled_with_mlu() or isinstance( + place, core.CustomPlace): return False compiled = isinstance(program, compiler.CompiledProgram) From 4c1e77d1f4fc8675b543be138be7d4df443c1baa Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Mon, 18 Jul 2022 17:39:24 +0800 Subject: [PATCH 243/250] fix typos in template for codegen of operators (#44364) --- paddle/phi/api/yaml/generator/templates/op.c.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/api/yaml/generator/templates/op.c.j2 b/paddle/phi/api/yaml/generator/templates/op.c.j2 index 7f13eb9582589..0c2708ce223c7 100644 --- a/paddle/phi/api/yaml/generator/templates/op.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/op.c.j2 @@ -11,7 +11,7 @@ #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/ternary.h" #include "paddle/phi/infermeta/multiary.h" -#include "paddle/phi/infermeta/backward.cc" +#include "paddle/phi/infermeta/backward.h" namespace paddle { namespace operators { From 1d12832669460a7a6f822e7b454010c0918271fe Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Mon, 18 Jul 2022 18:33:23 +0800 Subject: [PATCH 244/250] fix duplicate slice logic in _grad (#44396) --- .../autograd/test_autograd_functional_dynamic.py | 5 +++++ python/paddle/incubate/autograd/functional.py | 10 ++++++---- 
python/paddle/incubate/autograd/primapi.py | 11 +++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py index 4b61580452592..5eda21eb4c14b 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_autograd_functional_dynamic.py @@ -223,6 +223,11 @@ def test_all_cases(self): self.func_vjp_nested() self.func_vjp_aliased_input() + def test_input_single_tensor(self): + self.assertIsInstance( + paddle.incubate.autograd.vjp(paddle.tanh, paddle.rand((3, 4)))[1], + paddle.fluid.framework.Variable) + @utils.place(config.DEVICES) @utils.parameterize( diff --git a/python/paddle/incubate/autograd/functional.py b/python/paddle/incubate/autograd/functional.py index 6c740005f8253..3be95c88d12e7 100644 --- a/python/paddle/incubate/autograd/functional.py +++ b/python/paddle/incubate/autograd/functional.py @@ -565,13 +565,15 @@ def _grad(ys, xs, v=None): inputs. """ if paddle.fluid._non_static_mode(): + # paddle.grad returns a list though the inputs is a signle Tensor. The + # follow code snippet fixes the problem by return the first element of + # xs_grad when the xs is a signle Tensor. xs_grad = paddle.grad(ys, xs, v, create_graph=True, allow_unused=True) + if isinstance(xs, paddle.fluid.framework.Variable) and isinstance( + xs_grad, typing.Sequence) and len(xs_grad) > 0: + xs_grad = xs_grad[0] else: xs_grad = paddle.incubate.autograd.grad(ys, xs, v) - - if isinstance(xs, paddle.fluid.framework.Variable): - xs_grad = xs_grad[0] - return _replace_none_with_zero_tensor(xs_grad, xs) diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index 5b3ad0dd78a3b..a319874e25c8a 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -132,9 +132,16 @@ def grad(outputs, inputs, grad_outputs=None): paddle.incubate.autograd.disable_prim() paddle.disable_static() """ - if not utils.prim_enabled(): - return backward.gradients(outputs, inputs, grad_outputs) + grad_inputs = backward.gradients(outputs, inputs, grad_outputs) + # backward.gradients returns a list though the inputs is a signle Tensor. + # The follow code snippet fixes the problem by return the first element + # of grad_inputs when the inputs is a signle Tensor. + if isinstance(inputs, framework.Variable) and isinstance( + grad_inputs, typing.Sequence) and len(grad_inputs) > 0: + return grad_inputs[0] + else: + return grad_inputs if not isinstance(outputs, (framework.Variable, typing.Sequence)): raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], ' From b2224e6f391f3c400a754deaf6af32f722f1a7a0 Mon Sep 17 00:00:00 2001 From: Chenxiao Niu Date: Mon, 18 Jul 2022 19:02:18 +0800 Subject: [PATCH 245/250] [MLU] fix mlu ctest final. 
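Regarding the _grad fix above (#44396): when xs is a single Tensor, the gradient result is now a single Tensor rather than a one-element list. A minimal dygraph sketch of the behaviour the new test asserts (shapes are arbitrary):

import paddle

x = paddle.rand((3, 4))
# vjp returns (outputs, input_gradients); with a single input Tensor the
# gradient comes back as a Tensor, not a one-element list.
value, grad = paddle.incubate.autograd.vjp(paddle.tanh, x)
assert isinstance(grad, paddle.Tensor)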
(#44404) --- .../fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh | 3 +-- .../fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py | 6 +++--- python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py | 7 ++++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh index 97f21798c1154..36fc85ba6da07 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh +++ b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh @@ -17,5 +17,4 @@ set -e # use default values # FIXME: random fails on Unknown command lines -c (or -m). -launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py -MLU_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op_mlu.py +MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch c_comm_init_op_mlu.py diff --git a/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py index 89475eb698533..1f12d47da42a2 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_hard_swish_op_mlu.py @@ -16,13 +16,13 @@ import paddle.nn.functional as F import paddle.fluid as fluid import paddle +import sys + +sys.path.append("..") from op_test import OpTest import numpy as np import unittest -import sys - -sys.path.append("..") paddle.enable_static() SEED = 2021 diff --git a/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py index e52b5ee301c5a..fc1d62bfdad5d 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_spawn_mlu.py @@ -102,12 +102,13 @@ def test_get_default_nprocs(self): self.assertEqual(nprocs, core.get_mlu_device_count()) def test_spawn(self): - context = dist.spawn(train, backend='cncl', nprocs=4) + num_devs = core.get_mlu_device_count() + context = dist.spawn(train, backend='cncl', nprocs=num_devs) rank_list = [] - for i in range(4): + for i in range(num_devs): rank_list.append(context.return_queues[i].get()) rank_list.sort() - self.assertEqual(rank_list, list(range(4))) + self.assertEqual(rank_list, list(range(num_devs))) if __name__ == '__main__': From c6bf88127e2e53f144165496e44204b7d8fd80e2 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 18 Jul 2022 20:05:26 +0800 Subject: [PATCH 246/250] fix data transform bug of interpolate op (#44401) --- .../kernels/cpu/interpolate_grad_kernel.cc | 25 +++++++++++++++---- paddle/phi/kernels/cpu/interpolate_kernel.cc | 25 +++++++++++++++---- .../kernels/gpu/interpolate_grad_kernel.cu | 25 +++++++++++++++---- paddle/phi/kernels/gpu/interpolate_kernel.cu | 25 +++++++++++++++---- 4 files changed, 80 insertions(+), 20 deletions(-) diff --git a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc index edd41b2c7a31d..dee6e9149ca2d 100644 --- a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc @@ -1041,28 +1041,43 @@ PD_REGISTER_KERNEL(bilinear_interp_v2_grad, ALL_LAYOUT, phi::BilinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(nearest_interp_v2_grad, CPU, ALL_LAYOUT, phi::NearestInterpGradKernel, float, - double) {} + 
double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(trilinear_interp_v2_grad, CPU, ALL_LAYOUT, phi::TrilinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(linear_interp_v2_grad, CPU, ALL_LAYOUT, phi::LinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(bicubic_interp_v2_grad, CPU, ALL_LAYOUT, phi::BicubicInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/cpu/interpolate_kernel.cc b/paddle/phi/kernels/cpu/interpolate_kernel.cc index 5259a770568e4..3649185a0c7ee 100644 --- a/paddle/phi/kernels/cpu/interpolate_kernel.cc +++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc @@ -1193,7 +1193,10 @@ PD_REGISTER_KERNEL(bilinear_interp_v2, phi::BilinearInterpKernel, float, double, - uint8_t) {} + uint8_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(nearest_interp_v2, CPU, ALL_LAYOUT, @@ -1202,24 +1205,36 @@ PD_REGISTER_KERNEL(nearest_interp_v2, double, int, int64_t, - uint8_t) {} + uint8_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(trilinear_interp_v2, CPU, ALL_LAYOUT, phi::TrilinearInterpKernel, float, double, - uint8_t) {} + uint8_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(linear_interp_v2, CPU, ALL_LAYOUT, phi::LinearInterpKernel, float, double, - uint8_t) {} + uint8_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(bicubic_interp_v2, CPU, ALL_LAYOUT, phi::BicubicInterpKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu index 175f09fccfa30..047b4ff69a784 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu @@ -1574,28 +1574,43 @@ PD_REGISTER_KERNEL(bilinear_interp_v2_grad, ALL_LAYOUT, phi::BilinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(nearest_interp_v2_grad, GPU, ALL_LAYOUT, phi::NearestInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(trilinear_interp_v2_grad, GPU, ALL_LAYOUT, phi::TrilinearInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(linear_interp_v2_grad, GPU, ALL_LAYOUT, phi::LinearInterpGradKernel, float, - double) {} + double) { + 
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(bicubic_interp_v2_grad, GPU, ALL_LAYOUT, phi::BicubicInterpGradKernel, float, - double) {} + double) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/gpu/interpolate_kernel.cu b/paddle/phi/kernels/gpu/interpolate_kernel.cu index 7bc331c52a015..c05514236e091 100644 --- a/paddle/phi/kernels/gpu/interpolate_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu @@ -1446,7 +1446,10 @@ PD_REGISTER_KERNEL(bilinear_interp_v2, phi::BilinearInterpKernel, float, double, - int) {} + int) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(nearest_interp_v2, GPU, ALL_LAYOUT, @@ -1454,25 +1457,37 @@ PD_REGISTER_KERNEL(nearest_interp_v2, float, double, int, - int64_t) {} + int64_t) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(trilinear_interp_v2, GPU, ALL_LAYOUT, phi::TrilinearInterpKernel, float, double, - int) {} + int) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(linear_interp_v2, GPU, ALL_LAYOUT, phi::LinearInterpKernel, float, double, - int) {} + int) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} PD_REGISTER_KERNEL(bicubic_interp_v2, GPU, ALL_LAYOUT, phi::BicubicInterpKernel, float, double, - int) {} + int) { + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND); +} From 3f70b1d3c6038827fbdf336f14e9a0c0c9beb652 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Mon, 18 Jul 2022 20:35:06 +0800 Subject: [PATCH 247/250] [Sparse] Add sparse matmul kernel(coo*dense->dense) (#44346) --- paddle/fluid/platform/dynload/cusparse.cc | 4 + paddle/phi/api/yaml/sparse_api.yaml | 10 +- paddle/phi/api/yaml/sparse_bw_api.yaml | 7 +- paddle/phi/backends/dynload/cusparse.cc | 4 + .../funcs/sparse/sparse_blas_impl.cu.h | 3 + .../kernels/sparse/cpu/matmul_grad_kernel.cc | 12 +- .../phi/kernels/sparse/cpu/matmul_kernel.cc | 12 +- paddle/phi/kernels/sparse/empty_kernel.cc | 24 +- .../kernels/sparse/gpu/matmul_grad_kernel.cu | 60 ++++- .../phi/kernels/sparse/gpu/matmul_kernel.cu | 45 +++- .../kernels/sparse/impl/unary_kernel_impl.h | 6 +- .../phi/kernels/sparse/matmul_grad_kernel.h | 12 +- paddle/phi/kernels/sparse/matmul_kernel.h | 12 +- .../tests/unittests/test_sparse_matmul_op.py | 244 ++++++++---------- python/paddle/incubate/sparse/binary.py | 89 ++++--- 15 files changed, 293 insertions(+), 251 deletions(-) diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc index da93455e8bc7d..756737c1a169f 100644 --- a/paddle/fluid/platform/dynload/cusparse.cc +++ b/paddle/fluid/platform/dynload/cusparse.cc @@ -28,6 +28,10 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); #endif +#ifdef CUSPARSE_ROUTINE_EACH_R3 +CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_api.yaml index e32ce5b21540b..e816824b82f72 100644 --- 
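Regarding the interpolate registration change above (#44401): marking inputs 2 and 3, which appear to be the auxiliary size/scale tensors, as ALL_BACKEND lets them stay on their original device instead of being force-transformed to the kernel's backend. A minimal sketch of the affected call pattern; shapes and mode are arbitrary assumptions:

import paddle
import paddle.nn.functional as F

x = paddle.rand([1, 3, 6, 10])
out_size = paddle.to_tensor([12, 12], dtype='int32')  # size given as a Tensor
out = F.interpolate(x, size=out_size, mode='bilinear')
print(out.shape)  # [1, 3, 12, 12]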
a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_api.yaml @@ -297,7 +297,7 @@ args : (Tensor x, Tensor y, Tensor mask) output : Tensor(out) kernel : - func : csr_masked_matmul{dense, dense, sparse_csr -> sparse_csr} + func : masked_matmul_csr{dense, dense, sparse_csr -> sparse_csr} layout : x backward: masked_matmul_grad @@ -305,10 +305,10 @@ args : (Tensor x, Tensor y) output : Tensor(out) kernel : - func : csr_dense_matmul{sparse_csr, dense -> dense}, - csr_csr_matmul{sparse_csr, sparse_csr -> sparse_csr}, - coo_dense_matmul{sparse_coo, dense -> dense}, - coo_coo_matmul{sparse_coo, sparse_coo -> sparse_coo} + func : matmul_csr_dense {sparse_csr, dense -> dense}, + matmul_csr_csr {sparse_csr, sparse_csr -> sparse_csr}, + matmul_coo_dense {sparse_coo, dense -> dense}, + matmul_coo_coo {sparse_coo, sparse_coo -> sparse_coo} layout : x backward: matmul_grad diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_bw_api.yaml index 6e3a82a22bcfc..68e6020ac3626 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_bw_api.yaml @@ -125,14 +125,17 @@ args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) kernel : - func : csr_masked_matmul_grad{dense, dense, sparse_csr -> dense, dense} + func : masked_matmul_csr_grad{dense, dense, sparse_csr -> dense, dense} - backward_api : matmul_grad forward : matmul(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) kernel : - func : csr_dense_matmul_grad{sparse_csr, dense, dense -> sparse_csr, dense} + func : matmul_csr_dense_grad {sparse_csr, dense, dense -> sparse_csr, dense}, + matmul_csr_csr_grad {sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr}, + matmul_coo_dense_grad {sparse_coo, dense, dense -> sparse_coo, dense}, + matmul_coo_coo_grad {sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo} - backward_api : multiply_grad forward : multiply(Tensor x, Tensor y) -> Tensor(out) diff --git a/paddle/phi/backends/dynload/cusparse.cc b/paddle/phi/backends/dynload/cusparse.cc index 013211064b8e4..ce8f87dc3cdfa 100644 --- a/paddle/phi/backends/dynload/cusparse.cc +++ b/paddle/phi/backends/dynload/cusparse.cc @@ -30,5 +30,9 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); #endif +#ifdef CUSPARSE_ROUTINE_EACH_R3 +CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 3d92674c92d6e..9f7be26857bdb 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -298,6 +298,7 @@ class CuSparseDnVecDescriptor { cusparseDnVecDescr_t descriptor_; }; +/************* SPARSE*DENSE->DENSE MATMUL ************/ template <> template void SparseBlas::SPMM(bool transa, @@ -345,6 +346,7 @@ void SparseBlas::SPMM(bool transa, }); } +/************* SPARSE*DENSE->DENSE MV ************/ template <> template void SparseBlas::SPMV(bool transa, @@ -389,6 +391,7 @@ void SparseBlas::SPMV(bool transa, }); } +/************* DENSE*DENSE->SPARSE MATMUL ************/ #if CUDA_VERSION >= 11030 template <> template diff --git a/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc index cd1665b66431b..2586976b7636c 100644 --- a/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc +++ 
b/paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc @@ -22,7 +22,7 @@ namespace sparse { // TODO(zhouwei25): implement CPU backward kernel of " CSR @ DENSE -> DENSE" template -void CsrDenseMatmulGradKernel(const Context& dev_ctx, +void MatmulCsrDenseGradKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, const DenseTensor& dout, @@ -34,7 +34,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx, // TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR" template -void CsrMaskedMatmulGradKernel(const Context& dev_ctx, +void MaskedMatmulCsrGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& dout, @@ -47,18 +47,18 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(csr_dense_matmul_grad, +PD_REGISTER_KERNEL(matmul_csr_dense_grad, CPU, ALL_LAYOUT, - phi::sparse::CsrDenseMatmulGradKernel, + phi::sparse::MatmulCsrDenseGradKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -PD_REGISTER_KERNEL(csr_masked_matmul_grad, +PD_REGISTER_KERNEL(masked_matmul_csr_grad, CPU, ALL_LAYOUT, - phi::sparse::CsrMaskedMatmulGradKernel, + phi::sparse::MaskedMatmulCsrGradKernel, float, double) {} diff --git a/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc b/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc index 0818b8e900a05..8db0ccfd575e5 100644 --- a/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/matmul_kernel.cc @@ -22,7 +22,7 @@ namespace sparse { // TODO(zhouwei25): implement CPU kernel of " CSR @ DENSE -> DENSE" template -void CsrDenseMatmulKernel(const Context& dev_ctx, +void MatmulCsrDenseKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, DenseTensor* out) { @@ -32,7 +32,7 @@ void CsrDenseMatmulKernel(const Context& dev_ctx, // TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR" template -void CsrMaskedMatmulKernel(const Context& dev_ctx, +void MaskedMatmulCsrKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& mask, @@ -44,18 +44,18 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(csr_dense_matmul, +PD_REGISTER_KERNEL(matmul_csr_dense, CPU, ALL_LAYOUT, - phi::sparse::CsrDenseMatmulKernel, + phi::sparse::MatmulCsrDenseKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -PD_REGISTER_KERNEL(csr_masked_matmul, +PD_REGISTER_KERNEL(masked_matmul_csr, CPU, ALL_LAYOUT, - phi::sparse::CsrMaskedMatmulKernel, + phi::sparse::MaskedMatmulCsrKernel, float, double) {} diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc index c1706b9919d90..115611a272d94 100644 --- a/paddle/phi/kernels/sparse/empty_kernel.cc +++ b/paddle/phi/kernels/sparse/empty_kernel.cc @@ -26,37 +26,27 @@ template void EmptyLikeCooKernel(const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out) { - const DenseTensor& x_indices = x.non_zero_indices(); + out->set_dims(x.dims()); + *(out->mutable_non_zero_indices()) = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); - DenseTensor* out_indices = out->mutable_non_zero_indices(); DenseTensor* out_values = out->mutable_non_zero_elements(); - - phi::Copy(dev_ctx, x_indices, dev_ctx.GetPlace(), false, out_indices); - out_values->Resize(x_values.dims()); dev_ctx.template Alloc(out_values); - 
- out->set_dims(x.dims()); } template void EmptyLikeCsrKernel(const Context& dev_ctx, const SparseCsrTensor& x, SparseCsrTensor* out) { - const DenseTensor& x_crows = x.non_zero_crows(); - const DenseTensor& x_cols = x.non_zero_cols(); + out->set_dims(x.dims()); + *(out->mutable_non_zero_crows()) = x.non_zero_crows(); + *(out->mutable_non_zero_cols()) = x.non_zero_cols(); + const DenseTensor& x_values = x.non_zero_elements(); - DenseTensor* out_crows = out->mutable_non_zero_crows(); - DenseTensor* out_cols = out->mutable_non_zero_cols(); DenseTensor* out_values = out->mutable_non_zero_elements(); - - phi::Copy(dev_ctx, x_crows, dev_ctx.GetPlace(), false, out_crows); - phi::Copy(dev_ctx, x_cols, dev_ctx.GetPlace(), false, out_cols); - out_values->Resize(x_values.dims()); dev_ctx.template Alloc(out_values); - - out->set_dims(x.dims()); } } // namespace sparse diff --git a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu index d5c128fea6f29..c4bb66827e35a 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu @@ -22,13 +22,52 @@ limitations under the License. */ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/sparse/sparse_blas.h" #include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" namespace phi { namespace sparse { template -void CsrDenseMatmulGradKernel(const Context& dev_ctx, +void MatmulCooDenseGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& y, + const DenseTensor& dout, + SparseCooTensor* dx, + DenseTensor* dy) { +#if CUDA_VERSION >= 11030 + auto sparse_blas = phi::funcs::sparse::GetSparseBlas(dev_ctx); + + // dx{SparseCoo} = dout{Dense} * y'{Dense} + if (dx) { + // 'cusparseSDDMM' only support CSR now, so use COO->CSR->COO, + // which will increase some expenses. 
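+    // Descriptive note (grounded in the EmptyLikeCooKernel change above):
+    // EmptyLikeCooKernel shares x's indices tensor with dx and only allocates
+    // dx's values, so dx keeps exactly the sparsity pattern of x before the
+    // SDDMM result is written into it.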
+ EmptyLikeCooKernel(dev_ctx, x, dx); + SparseCsrTensor dx_csr = SparseCooToCsr(dev_ctx, *dx); + sparse_blas.SDDMM( + false, true, static_cast(1), dout, y, static_cast(0), &dx_csr); + SparseCsrToCooKernel(dev_ctx, dx_csr, dx); + } + + // dy{Dense} = x'{SparseCoo} * dout{Dense} + if (dy) { + MetaTensor meta_dy(dy); + meta_dy.set_dims(y.dims()); + meta_dy.set_dtype(y.dtype()); + dev_ctx.template Alloc(dy); + + sparse_blas.SPMM( + true, false, static_cast(1), x, dout, static_cast(0), dy); + } +#else + PADDLE_THROW(phi::errors::Unimplemented( + "backward of 'sparse.matmul' use cusparseSDDMM, which is supported from " + "CUDA 11.3")); +#endif +} + +template +void MatmulCsrDenseGradKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, const DenseTensor& dout, @@ -66,7 +105,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx, } template -void CsrMaskedMatmulGradKernel(const Context& dev_ctx, +void MaskedMatmulCsrGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& dout, @@ -119,18 +158,27 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(csr_dense_matmul_grad, +PD_REGISTER_KERNEL(matmul_coo_dense_grad, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCooDenseGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(matmul_csr_dense_grad, GPU, ALL_LAYOUT, - phi::sparse::CsrDenseMatmulGradKernel, + phi::sparse::MatmulCsrDenseGradKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -PD_REGISTER_KERNEL(csr_masked_matmul_grad, +PD_REGISTER_KERNEL(masked_matmul_csr_grad, GPU, ALL_LAYOUT, - phi::sparse::CsrMaskedMatmulGradKernel, + phi::sparse::MaskedMatmulCsrGradKernel, float, double) {} diff --git a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu index 69cd4bac0c763..3adbce0dd17df 100644 --- a/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/matmul_kernel.cu @@ -31,11 +31,11 @@ limitations under the License. 
*/ namespace phi { namespace sparse { -template -void CsrDenseMatmulKernel(const Context& dev_ctx, - const SparseCsrTensor& x, - const DenseTensor& y, - DenseTensor* out) { +template +void MatmulKernelImpl(const Context& dev_ctx, + const TensorType& x, + const DenseTensor& y, + DenseTensor* out) { #if CUDA_VERSION >= 11000 std::vector xdim_vec = phi::vectorize(x.dims()); std::vector ydim_vec = phi::vectorize(y.dims()); @@ -91,7 +91,23 @@ void CsrDenseMatmulKernel(const Context& dev_ctx, } template -void CsrMaskedMatmulKernel(const Context& dev_ctx, +void MatmulCooDenseKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& y, + DenseTensor* out) { + MatmulKernelImpl(dev_ctx, x, y, out); +} + +template +void MatmulCsrDenseKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const DenseTensor& y, + DenseTensor* out) { + MatmulKernelImpl(dev_ctx, x, y, out); +} + +template +void MaskedMatmulCsrKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& mask, @@ -176,18 +192,27 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PD_REGISTER_KERNEL(csr_dense_matmul, +PD_REGISTER_KERNEL(matmul_csr_dense, GPU, ALL_LAYOUT, - phi::sparse::CsrDenseMatmulKernel, + phi::sparse::MatmulCsrDenseKernel, float, double) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } -PD_REGISTER_KERNEL(csr_masked_matmul, +PD_REGISTER_KERNEL(matmul_coo_dense, + GPU, + ALL_LAYOUT, + phi::sparse::MatmulCooDenseKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(masked_matmul_csr, GPU, ALL_LAYOUT, - phi::sparse::CsrMaskedMatmulKernel, + phi::sparse::MaskedMatmulCsrKernel, float, double) {} diff --git a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h index 231fc551f4788..2639753266db6 100644 --- a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h +++ b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h @@ -134,7 +134,7 @@ void CastCooKernel(const Context& dev_ctx, DenseTensor* out_values = out->mutable_non_zero_elements(); if (index_dtype == DataType::UNDEFINED) { - phi::Copy(dev_ctx, x_indices, dev_ctx.GetPlace(), false, out_indices); + *out_indices = x_indices; } else { phi::MetaTensor meta(out_indices); meta.set_dims(x_indices.dims()); @@ -172,8 +172,8 @@ void CastCsrKernel(const Context& dev_ctx, DenseTensor* out_values = out->mutable_non_zero_elements(); if (index_dtype == DataType::UNDEFINED) { - phi::Copy(dev_ctx, x_crows, dev_ctx.GetPlace(), false, out_crows); - phi::Copy(dev_ctx, x_cols, dev_ctx.GetPlace(), false, out_cols); + *out_crows = x_crows; + *out_cols = x_cols; } else { phi::MetaTensor crows_meta(out_crows); crows_meta.set_dims(x_crows.dims()); diff --git a/paddle/phi/kernels/sparse/matmul_grad_kernel.h b/paddle/phi/kernels/sparse/matmul_grad_kernel.h index 787691f3515d6..4acb7bb7e1eb5 100644 --- a/paddle/phi/kernels/sparse/matmul_grad_kernel.h +++ b/paddle/phi/kernels/sparse/matmul_grad_kernel.h @@ -23,16 +23,16 @@ namespace sparse { // TODO(zhouwei25): implement Backward of " COO @ COO -> COO" template -void CooCooMatmulGradKernel(const Context& dev_ctx, +void MatmulCooCooGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const SparseCooTensor& y, const SparseCooTensor& dout, SparseCooTensor* dx, SparseCooTensor* dy); -// TODO(zhouwei25): implement Backward of " COO @ DENSE -> DENSE" +// Backward of " COO @ DENSE -> DENSE" template -void 
CooDenseMatmulGradKernel(const Context& dev_ctx, +void MatmulCooDenseGradKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& y, const DenseTensor& dout, @@ -41,7 +41,7 @@ void CooDenseMatmulGradKernel(const Context& dev_ctx, // TODO(zhouwei25): implement Backward of " CSR @ CSR -> CSR" template -void CsrCsrMatmulGradKernel(const Context& dev_ctx, +void MatmulCsrCsrGradKernel(const Context& dev_ctx, const SparseCsrTensor& x, const SparseCsrTensor& y, const SparseCsrTensor& dout, @@ -50,7 +50,7 @@ void CsrCsrMatmulGradKernel(const Context& dev_ctx, /* Backward of "CSR @ DENSE -> DENSE" */ template -void CsrDenseMatmulGradKernel(const Context& dev_ctx, +void MatmulCsrDenseGradKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, const DenseTensor& dout, @@ -59,7 +59,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx, /* Backward of "DENSE @ DENSE * CSR_MASK -> CSR" */ template -void CsrMaskedMatmulGradKernel(const Context& dev_ctx, +void MaskedMatmulCsrGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& dout, diff --git a/paddle/phi/kernels/sparse/matmul_kernel.h b/paddle/phi/kernels/sparse/matmul_kernel.h index d9093a020c207..a261bbf3cd3f7 100644 --- a/paddle/phi/kernels/sparse/matmul_kernel.h +++ b/paddle/phi/kernels/sparse/matmul_kernel.h @@ -23,35 +23,35 @@ namespace sparse { // TODO(zhouwei25): implement " COO @ COO -> COO" template -void CooCooMatmulKernel(const Context& dev_ctx, +void MatmulCooCooKernel(const Context& dev_ctx, const SparseCooTensor& x, const SparseCooTensor& y, SparseCooTensor* out); -// TODO(zhouwei25): implement " COO @ DENSE -> DENSE" +/* COO @ DENSE -> DENSE */ template -void CooDenseMatmulKernel(const Context& dev_ctx, +void MatmulCooDenseKernel(const Context& dev_ctx, const SparseCooTensor& x, const DenseTensor& y, DenseTensor* out); // TODO(zhouwei25): implement " CSR @ CSR -> CSR" template -void CsrCsrMatmulKernel(const Context& dev_ctx, +void MatmulCsrCsrKernel(const Context& dev_ctx, const SparseCsrTensor& x, const SparseCsrTensor& y, SparseCsrTensor* out); /* CSR @ DENSE -> DENSE */ template -void CsrDenseMatmulKernel(const Context& dev_ctx, +void MatmulCsrDenseKernel(const Context& dev_ctx, const SparseCsrTensor& x, const DenseTensor& y, DenseTensor* out); /* DENSE @ DENSE * CSR_MASK -> CSR */ template -void CsrMaskedMatmulKernel(const Context& dev_ctx, +void MaskedMatmulCsrKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, const SparseCsrTensor& mask, diff --git a/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py b/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py index 96adf959b2b6e..8986d4a7ef5d2 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py @@ -13,8 +13,6 @@ # limitations under the License. 
import paddle -from paddle.fluid.framework import _test_eager_guard - import numpy as np import scipy import scipy.sparse as sp @@ -22,7 +20,7 @@ import os import re -np.random.seed(2022) +paddle.set_default_dtype('float64') def get_cuda_version(): @@ -37,153 +35,115 @@ def get_cuda_version(): return -1 -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000, - "paddle is not compiled with CUDA and cuda version need to >= 11.0") -class TestCsrDenseMatmul2D(unittest.TestCase): - # x: csr, y: dense, out: dense - def test_matmul(self): - with _test_eager_guard(): - mask = np.random.rand(10, 12) < 0.2 - np_x = np.random.rand(10, 12) * mask - - np_csr = sp.csr_matrix(np_x) - np_dense = np.random.rand(12, 6) - np_out = np_csr @ np_dense - - np_out_grad = np.ones([10, 6]) - - # dx(csr) = dout(dense) * y'(dense) * mask - np_csr_grad = sp.csr_matrix( - np.matmul(np_out_grad, np_dense.transpose(1, 0)) * mask) - # dy(dense) = x'(csr) * dout(dense) - np_dense_grad = np_csr.transpose() @ np_out_grad - - csr = paddle.to_tensor(np_x, stop_gradient=False).to_sparse_csr() - dense = paddle.to_tensor(np_dense, stop_gradient=False) - out = paddle.incubate.sparse.matmul(csr, dense) - - self.assertTrue(np.allclose(np_out, out.numpy())) - - if get_cuda_version() >= 11030: - out.backward() - self.assertTrue( - np.allclose(np_csr_grad.indptr, - csr.grad.crows().numpy())) - self.assertTrue( - np.allclose(np_csr_grad.indices, - csr.grad.cols().numpy())) - self.assertTrue( - np.allclose(np_csr_grad.data, - csr.grad.values().numpy())) - - self.assertTrue(np.allclose(np_dense_grad, dense.grad.numpy())) - - -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11030, - "paddle is not compiled with CUDA and cuda version need to >= 11.3") -class TestCsrMaskedMatmul2D(unittest.TestCase): - # x: dense, y: dense, out: csr - def test_matmul(self): - with _test_eager_guard(): - np_mask = np.random.rand(10, 6) < 0.2 - - np_x = np.random.rand(10, 12) - np_y = np.random.rand(12, 6) - np_out = sp.csr_matrix(np.matmul(np_x, np_y) * np_mask) - - np_out_grad = sp.csr_matrix(np.ones([10, 6]) * np_mask) - # dx(dense) = dout(csr) * y'(dense) - np_x_grad = np_out_grad @ np_y.transpose(1, 0) - # dy(dense) = x'(dense) * dout(csr) -> dy'(dense) = dout'(csr) * x(dense) - np_y_grad = (np_out_grad.transpose() @ np_x).transpose(1, 0) - - x = paddle.to_tensor(np_x, stop_gradient=False) - y = paddle.to_tensor(np_y, stop_gradient=False) - mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr() - out = paddle.incubate.sparse.masked_matmul(x, y, mask) - - self.assertTrue(np.allclose(np_out.indptr, out.crows().numpy())) - self.assertTrue(np.allclose(np_out.indices, out.cols().numpy())) - self.assertTrue(np.allclose(np_out.data, out.values().numpy())) - - out.backward() - self.assertTrue(np.allclose(out.is_sparse_csr(), True)) - self.assertTrue(np.allclose(np_x_grad, x.grad.numpy())) - self.assertTrue(np.allclose(np_y_grad, y.grad.numpy())) - - -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070, - "paddle is not compiled with CUDA and cuda version need to >= 11.7") -class TestCsrDenseMatmul3D(unittest.TestCase): - # x: csr, y: dense, out: dense - def test_matmul(self): - with _test_eager_guard(): - paddle.set_default_dtype('float32') - origin_x = paddle.rand([16, 16, 12]) - mask = paddle.randint(0, 2, [16, 12]) - origin_x = origin_x * mask - origin_y = paddle.rand([16, 12, 10]) - - dense_x = origin_x.detach() - dense_x.stop_gradient = False - dense_y 
= origin_y.detach() - dense_y.stop_gradient = False - dense_out = paddle.matmul(dense_x, dense_y) - dense_out.backward() - +class TestMatmul(unittest.TestCase): + # x: sparse, y: dense, out: dense + def check_result(self, x_shape, y_shape, format): + if len(x_shape) == 3: + mask = paddle.randint(0, 2, [x_shape[-2], x_shape[-1]]) + else: + mask = paddle.randint(0, 2, x_shape) + origin_x = paddle.rand(x_shape) * mask + origin_y = paddle.rand(y_shape) + + dense_x = origin_x.detach() + dense_x.stop_gradient = False + dense_y = origin_y.detach() + dense_y.stop_gradient = False + dense_out = paddle.matmul(dense_x, dense_y) + + if format == "coo": + sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) + else: sp_x = origin_x.detach().to_sparse_csr() - sp_x.stop_gradient = False - sp_y = origin_y.detach() - sp_y.stop_gradient = False - sp_out = paddle.incubate.sparse.matmul(sp_x, sp_y) - sp_out.backward() + sp_x.stop_gradient = False + sp_y = origin_y.detach() + sp_y.stop_gradient = False + sp_out = paddle.incubate.sparse.matmul(sp_x, sp_y) - self.assertTrue(np.allclose(sp_out.numpy(), dense_out.numpy())) + self.assertTrue(np.allclose(sp_out.numpy(), dense_out.numpy())) + if get_cuda_version() >= 11030: + dense_out.backward() + sp_out.backward() self.assertTrue( np.allclose(sp_x.grad.to_dense().numpy(), (dense_x.grad * mask).numpy())) self.assertTrue(np.allclose(sp_y.grad.numpy(), dense_y.grad.numpy())) - -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_cuda_version() < 11070, - "paddle is not compiled with CUDA and cuda version need to >= 11.7") -class TestCsrMaskedMatmul3D(unittest.TestCase): - # x: dense, y: dense, out: csr - def test_matmul(self): - with _test_eager_guard(): - paddle.set_default_dtype('float64') - origin_x = paddle.rand([16, 16, 12]) - origin_y = paddle.rand([16, 12, 10]) - - mask = paddle.randint(0, 2, [16, 10]) - - dense_x = origin_x.detach() - dense_x.stop_gradient = False - dense_y = origin_y.detach() - dense_y.stop_gradient = False - dense_out = paddle.matmul(dense_x, dense_y) - dense_out = dense_out * mask - dense_out.backward() - - sp_x = origin_x.detach() - sp_x.stop_gradient = False - sp_y = origin_y.detach() - sp_y.stop_gradient = False - sp_out = paddle.incubate.sparse.masked_matmul( - sp_x, sp_y, dense_out.to_sparse_csr()) - sp_out.backward() - - self.assertTrue( - np.allclose(sp_out.to_dense().numpy(), dense_out.numpy())) - self.assertTrue(np.allclose(sp_x.grad.numpy(), - dense_x.grad.numpy())) - self.assertTrue(np.allclose(sp_y.grad.numpy(), - dense_y.grad.numpy())) + @unittest.skipIf(not paddle.is_compiled_with_cuda() + or get_cuda_version() < 11000, "only support cuda>=11.0") + def test_matmul_2d(self): + self.check_result([16, 12], [12, 10], 'coo') + self.check_result([16, 12], [12, 10], 'csr') + + @unittest.skipIf(not paddle.is_compiled_with_cuda() + or get_cuda_version() < 11070, "only support cuda>=11.7") + def test_matmul_3d(self): + self.check_result([8, 16, 12], [8, 12, 10], 'coo') + self.check_result([8, 16, 12], [8, 12, 10], 'csr') + + +class TestMaskedMatmul(unittest.TestCase): + # x: dense, y: dense, out: sparse_`csr + @unittest.skipIf(not paddle.is_compiled_with_cuda() + or get_cuda_version() < 11030, + "only support on cuda>=11.3") + def test_masked_matmul_2d(self): + np_mask = np.random.rand(10, 6) < 0.2 + + np_x = np.random.rand(10, 12) + np_y = np.random.rand(12, 6) + np_out = sp.csr_matrix(np.matmul(np_x, np_y) * np_mask) + + np_out_grad = sp.csr_matrix(np.ones([10, 6]) * np_mask) + # dx(dense) = dout(csr) * y'(dense) + np_x_grad 
= np_out_grad @ np_y.transpose(1, 0) + # dy(dense) = x'(dense) * dout(csr) -> dy'(dense) = dout'(csr) * x(dense) + np_y_grad = (np_out_grad.transpose() @ np_x).transpose(1, 0) + + x = paddle.to_tensor(np_x, stop_gradient=False) + y = paddle.to_tensor(np_y, stop_gradient=False) + mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr() + out = paddle.incubate.sparse.masked_matmul(x, y, mask) + + self.assertTrue(np.allclose(np_out.indptr, out.crows().numpy())) + self.assertTrue(np.allclose(np_out.indices, out.cols().numpy())) + self.assertTrue(np.allclose(np_out.data, out.values().numpy())) + + out.backward() + self.assertTrue(np.allclose(out.is_sparse_csr(), True)) + self.assertTrue(np.allclose(np_x_grad, x.grad.numpy())) + self.assertTrue(np.allclose(np_y_grad, y.grad.numpy())) + + @unittest.skipIf(not paddle.is_compiled_with_cuda() + or get_cuda_version() < 11070, + "only support on cuda>=11.7") + def test_masked_matmul_3d(self): + paddle.set_default_dtype('float32') + origin_x = paddle.rand([16, 16, 12]) + mask = paddle.randint(0, 2, [16, 12]) + origin_x = origin_x * mask + origin_y = paddle.rand([16, 12, 10]) + + dense_x = origin_x.detach() + dense_x.stop_gradient = False + dense_y = origin_y.detach() + dense_y.stop_gradient = False + dense_out = paddle.matmul(dense_x, dense_y) + dense_out.backward() + + sp_x = origin_x.detach().to_sparse_csr() + sp_x.stop_gradient = False + sp_y = origin_y.detach() + sp_y.stop_gradient = False + sp_out = paddle.incubate.sparse.matmul(sp_x, sp_y) + sp_out.backward() + + self.assertTrue(np.allclose(sp_out.numpy(), dense_out.numpy())) + self.assertTrue( + np.allclose(sp_x.grad.to_dense().numpy(), + (dense_x.grad * mask).numpy())) + self.assertTrue(np.allclose(sp_y.grad.numpy(), dense_y.grad.numpy())) if __name__ == "__main__": diff --git a/python/paddle/incubate/sparse/binary.py b/python/paddle/incubate/sparse/binary.py index 0c90cd92a7537..7a7861f7b20e7 100644 --- a/python/paddle/incubate/sparse/binary.py +++ b/python/paddle/incubate/sparse/binary.py @@ -62,29 +62,37 @@ def matmul(x, y, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - paddle.seed(100) # csr @ dense -> dense - - with _test_eager_guard(): - crows = [0, 2, 3, 5] - cols = [1, 3, 2, 0, 1] - values = [1., 2., 3., 4., 5.] - dense_shape = [3, 4] - csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], - # values=[1., 2., 3., 4., 5.]) - dense = paddle.randn([4, 3]) - - out = paddle.incubate.sparse.matmul(csr, dense) - # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, - # [[-1.94294846 , -3.33990622 , 0.62359387 ], - # [-4.12815523 , 3.46535444 , -3.27413893 ], - # [-0.15209436 , -19.23207283, -3.35593438 ]]) - + crows = [0, 1, 2, 3] + cols = [1, 2, 0] + values = [1., 2., 3.] + csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, [3, 3]) + # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 1, 2, 3], + # cols=[1, 2, 0], + # values=[1., 2., 3.]) + dense = paddle.ones([3, 2]) + out = paddle.incubate.sparse.matmul(csr, dense) + # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[1., 1.], + # [2., 2.], + # [3., 3.]]) + + # coo @ dense -> dense + indices = [[0, 1, 2], [1, 2, 0]] + values = [1., 2., 3.] 
+ coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values, [3, 3]) + # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # indices=[[0, 1, 2], + # [1, 2, 0]], + # values=[1., 2., 3.]) + dense = paddle.ones([3, 2]) + out = paddle.incubate.sparse.matmul(coo, dense) + # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[1., 1.], + # [2., 2.], + # [3., 3.]]) """ return _C_ops.final_state_sparse_matmul(x, y) @@ -123,30 +131,27 @@ def masked_matmul(x, y, mask, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard paddle.seed(100) # dense @ dense * csr_mask -> csr - - with _test_eager_guard(): - crows = [0, 2, 3, 5] - cols = [1, 3, 2, 0, 1] - values = [1., 2., 3., 4., 5.] - dense_shape = [3, 4] - mask = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], - # values=[1., 2., 3., 4., 5.]) - - x = paddle.rand([3, 5]) - y = paddle.rand([5, 4]) - - out = paddle.incubate.sparse.masked_matmul(x, y, mask) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], - # values=[0.98986477, 0.97800624, 1.14591956, 0.68561077, 0.94714981]) + crows = [0, 2, 3, 5] + cols = [1, 3, 2, 0, 1] + values = [1., 2., 3., 4., 5.] + dense_shape = [3, 4] + mask = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 3, 5], + # cols=[1, 3, 2, 0, 1], + # values=[1., 2., 3., 4., 5.]) + + x = paddle.rand([3, 5]) + y = paddle.rand([5, 4]) + + out = paddle.incubate.sparse.masked_matmul(x, y, mask) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 3, 5], + # cols=[1, 3, 2, 0, 1], + # values=[0.98986477, 0.97800624, 1.14591956, 0.68561077, 0.94714981]) """ return _C_ops.final_state_sparse_masked_matmul(x, y, mask) From dd0a07f23620f7a859c410ed8be3963f0ea6d647 Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Mon, 18 Jul 2022 21:23:27 +0800 Subject: [PATCH 248/250] fix new autodiff api docs (#44341) --- python/paddle/incubate/autograd/primapi.py | 26 +++++++++++++--------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index a319874e25c8a..ba7a2537df133 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -26,14 +26,14 @@ def forward_grad(outputs, inputs, grad_inputs=None): **ONLY available in the static mode and primitive operators.** Args: - outputs: The output tensor or tensors - inputs: The input tensor or tensors - grad_inputs: The gradient Tensor or Tensors of inputs which has - the same shape with inputs, Defaults to None, in this case is - equivalent to all ones . + outputs(Tensor|Sequence[Tensor]): The output tensor or tensors. + inputs(Tensor|Sequence[Tensor]): The input tensor or tensors. + grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or + Tensors of inputs which has the same shape with inputs, Defaults to + None, in this case is equivalent to all ones. Returns: - grad_outputs (Tensor|Sequence[Tensor]): The gradients for outputs. + grad_outputs(Tensor|Sequence[Tensor]): The gradients for outputs. 
Examples: @@ -99,14 +99,14 @@ def grad(outputs, inputs, grad_outputs=None): **ONLY available in the static mode and primitive operators** Args: - outputs (Tensor|Sequence[Tensor]): The output Tensor or Tensors. - inputs (Tensor|Sequence[Tensor]): The input Tensor or Tensors. - grad_outputs (Tensor|Sequence[Tensor]): The gradient Tensor or + outputs(Tensor|Sequence[Tensor]): The output Tensor or Tensors. + inputs(Tensor|Sequence[Tensor]): The input Tensor or Tensors. + grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or Tensors of outputs which has the same shape with outputs, Defaults - to None, in this case is equivalent to all ones . + to None, in this case is equivalent to all ones. Returns: - grad_inputs (Tensor|Tensors): The gradients for inputs. + grad_inputs(Tensor|Tensors): The gradients for inputs. Examples: @@ -114,8 +114,10 @@ def grad(outputs, inputs, grad_outputs=None): import numpy as np import paddle + paddle.enable_static() paddle.incubate.autograd.enable_prim() + startup_program = paddle.static.Program() main_program = paddle.static.Program() with paddle.static.program_guard(main_program, startup_program): @@ -124,11 +126,13 @@ def grad(outputs, inputs, grad_outputs=None): y = x * x x_grad = paddle.incubate.autograd.grad(y, x) paddle.incubate.autograd.prim2orig() + exe = paddle.static.Executor() exe.run(startup_program) x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad]) print(x_grad) # [array([4.], dtype=float32)] + paddle.incubate.autograd.disable_prim() paddle.disable_static() """ From 08cada982c567d452ed71f4a228e2f5785e5f601 Mon Sep 17 00:00:00 2001 From: RichardWooSJTU <37864677+RichardWooSJTU@users.noreply.github.com> Date: Mon, 18 Jul 2022 21:40:45 +0800 Subject: [PATCH 249/250] fix build error in low arch (#44391) --- .../inference/tensorrt/plugin/fused_token_prune_op_plugin.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu index 627ef44e6fd75..c10ab7277e788 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -38,10 +38,12 @@ __global__ void ElementwiseMask(const T* a, const T* b, T* res, int num_elements) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) auto tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid >= num_elements) return; const T zero = 0; res[tid] = b[tid] >= zero ? 
a[tid] : zero; +#endif } template @@ -121,6 +123,7 @@ __global__ void ReduceSum2( template <> __global__ void ReduceSum2( const half* src, half* dst, int bsz, int nb_head, int max_seq_len) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) int tid = threadIdx.x; int bid = blockIdx.x; int num_blocks_per_head = ((max_seq_len / blockDim.x) * max_seq_len); @@ -152,6 +155,7 @@ __global__ void ReduceSum2( static_cast(bsz * max_seq_len), static_cast(res_half[0])); } +#endif } template From 130c108ad979ba8dd44b72eeb7c5f498de253bdd Mon Sep 17 00:00:00 2001 From: JYChen Date: Tue, 19 Jul 2022 10:43:24 +0800 Subject: [PATCH 250/250] [new api] add new api paddle.vision.ops.distribute_fpn_proposals (#43736) * add distribute_fpn_proposals * change to new dygraph * fix doc and example code * change fluid impl to current version --- python/paddle/fluid/layers/detection.py | 55 ++------ .../test_distribute_fpn_proposals_op.py | 61 ++++++++- python/paddle/vision/ops.py | 118 ++++++++++++++++++ 3 files changed, 187 insertions(+), 47 deletions(-) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index f89c95b93a1d3..aa6df245480cb 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -17,6 +17,8 @@ from __future__ import print_function +import paddle + from .layer_function_generator import generate_layer_fn from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper @@ -3774,52 +3776,13 @@ def distribute_fpn_proposals(fpn_rois, refer_level=4, refer_scale=224) """ - num_lvl = max_level - min_level + 1 - - if _non_static_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', - refer_level, 'refer_scale', refer_scale) - multi_rois, restore_ind, rois_num_per_level = _C_ops.distribute_fpn_proposals( - fpn_rois, rois_num, num_lvl, num_lvl, *attrs) - return multi_rois, restore_ind, rois_num_per_level - - check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], - 'distribute_fpn_proposals') - helper = LayerHelper('distribute_fpn_proposals', **locals()) - dtype = helper.input_dtype('fpn_rois') - multi_rois = [ - helper.create_variable_for_type_inference(dtype) for i in range(num_lvl) - ] - - restore_ind = helper.create_variable_for_type_inference(dtype='int32') - - inputs = {'FpnRois': fpn_rois} - outputs = { - 'MultiFpnRois': multi_rois, - 'RestoreIndex': restore_ind, - } - - if rois_num is not None: - inputs['RoisNum'] = rois_num - rois_num_per_level = [ - helper.create_variable_for_type_inference(dtype='int32') - for i in range(num_lvl) - ] - outputs['MultiLevelRoIsNum'] = rois_num_per_level - - helper.append_op(type='distribute_fpn_proposals', - inputs=inputs, - outputs=outputs, - attrs={ - 'min_level': min_level, - 'max_level': max_level, - 'refer_level': refer_level, - 'refer_scale': refer_scale - }) - if rois_num is not None: - return multi_rois, restore_ind, rois_num_per_level - return multi_rois, restore_ind + return paddle.vision.ops.distribute_fpn_proposals(fpn_rois=fpn_rois, + min_level=min_level, + max_level=max_level, + refer_level=refer_level, + refer_scale=refer_scale, + rois_num=rois_num, + name=name) @templatedoc() diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py index 06cdaed1988cc..7950c2784221f 100644 --- 
a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ import numpy as np import math import sys +import paddle + from op_test import OpTest @@ -164,5 +166,62 @@ def init_test_case(self): self.pixel_offset = False +class TestDistributeFpnProposalsAPI(unittest.TestCase): + + def setUp(self): + np.random.seed(678) + self.rois_np = np.random.rand(10, 4).astype('float32') + self.rois_num_np = np.array([4, 6]).astype('int32') + + def test_dygraph_with_static(self): + paddle.enable_static() + rois = paddle.static.data(name='rois', shape=[10, 4], dtype='float32') + rois_num = paddle.static.data(name='rois_num', + shape=[None], + dtype='int32') + multi_rois, restore_ind, rois_num_per_level = paddle.vision.ops.distribute_fpn_proposals( + fpn_rois=rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num) + fetch_list = multi_rois + [restore_ind] + rois_num_per_level + + exe = paddle.static.Executor() + output_stat = exe.run(paddle.static.default_main_program(), + feed={ + 'rois': self.rois_np, + 'rois_num': self.rois_num_np + }, + fetch_list=fetch_list, + return_numpy=False) + output_stat_np = [] + for output in output_stat: + output_np = np.array(output) + if len(output_np) > 0: + output_stat_np.append(output_np) + + paddle.disable_static() + rois_dy = paddle.to_tensor(self.rois_np) + rois_num_dy = paddle.to_tensor(self.rois_num_np) + multi_rois_dy, restore_ind_dy, rois_num_per_level_dy = paddle.vision.ops.distribute_fpn_proposals( + fpn_rois=rois_dy, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num_dy) + output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy + output_dy_np = [] + for output in output_dy: + output_np = output.numpy() + if len(output_np) > 0: + output_dy_np.append(output_np) + + for res_stat, res_dy in zip(output_stat_np, output_dy_np): + self.assertTrue(np.allclose(res_stat, res_dy)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 7febf4f740ea2..545ba25f5b420 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -28,6 +28,7 @@ 'yolo_box', 'deform_conv2d', 'DeformConv2D', + 'distribute_fpn_proposals', 'read_file', 'decode_jpeg', 'roi_pool', @@ -835,6 +836,123 @@ def forward(self, x, offset, mask=None): return out +def distribute_fpn_proposals(fpn_rois, + min_level, + max_level, + refer_level, + refer_scale, + pixel_offset=False, + rois_num=None, + name=None): + r""" + In Feature Pyramid Networks (FPN) models, it is needed to distribute + all proposals into different FPN level, with respect to scale of the proposals, + the referring scale and the referring level. Besides, to restore the order of + proposals, we return an array which indicates the original index of rois + in current proposals. To compute FPN level for each roi, the formula is given as follows: + + .. math:: + roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} + level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) + where BBoxArea is a function to compute the area of each roi. + + Args: + fpn_rois (Tensor): The input fpn_rois. 
2-D Tensor with shape [N, 4] and data type can be + float32 or float64. + min_level (int): The lowest level of FPN layer where the proposals come + from. + max_level (int): The highest level of FPN layer where the proposals + come from. + refer_level (int): The referring level of FPN layer with specified scale. + refer_scale (int): The referring scale of FPN layer with specified level. + pixel_offset (bool, optional): Whether there is pixel offset. If True, the offset of + image shape will be 1. 'False' by default. + rois_num (Tensor, optional): 1-D Tensor contains the number of RoIs in each image. + The shape is [B] and data type is int32. B is the number of images. + If rois_num not None, it will return a list of 1-D Tensor. Each element + is the output RoIs' number of each image on the corresponding level + and the shape is [B]. None by default. + name (str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is + and data type is same as `fpn_rois` . The length is max_level-min_level+1. + restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] + , where N is the number of total rois. The data type is int32. + rois_num_per_level (List): A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape + is [B] and data type of int32, where B is the number of images. + + Examples: + .. code-block:: python + + import paddle + + fpn_rois = paddle.rand((10, 4)) + rois_num = paddle.to_tensor([3, 1, 4, 2], dtype=paddle.int32) + + multi_rois, restore_ind, rois_num_per_level = paddle.vision.ops.distribute_fpn_proposals( + fpn_rois=fpn_rois, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num) + """ + num_lvl = max_level - min_level + 1 + + if _non_static_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." 
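+        # attrs packs the op attributes as a flattened (name, value) sequence;
+        # the dygraph op returns one Tensor per FPN level for multi_rois and
+        # rois_num_per_level, mirroring the static-graph branch below.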
+ attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', + refer_level, 'refer_scale', refer_scale, 'pixel_offset', + pixel_offset) + multi_rois, restore_ind, rois_num_per_level = _C_ops.distribute_fpn_proposals( + fpn_rois, rois_num, num_lvl, num_lvl, *attrs) + return multi_rois, restore_ind, rois_num_per_level + + else: + check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], + 'distribute_fpn_proposals') + helper = LayerHelper('distribute_fpn_proposals', **locals()) + dtype = helper.input_dtype('fpn_rois') + multi_rois = [ + helper.create_variable_for_type_inference(dtype) + for i in range(num_lvl) + ] + + restore_ind = helper.create_variable_for_type_inference(dtype='int32') + + inputs = {'FpnRois': fpn_rois} + outputs = { + 'MultiFpnRois': multi_rois, + 'RestoreIndex': restore_ind, + } + + if rois_num is not None: + inputs['RoisNum'] = rois_num + rois_num_per_level = [ + helper.create_variable_for_type_inference(dtype='int32') + for i in range(num_lvl) + ] + outputs['MultiLevelRoIsNum'] = rois_num_per_level + else: + rois_num_per_level = None + + helper.append_op(type='distribute_fpn_proposals', + inputs=inputs, + outputs=outputs, + attrs={ + 'min_level': min_level, + 'max_level': max_level, + 'refer_level': refer_level, + 'refer_scale': refer_scale, + 'pixel_offset': pixel_offset + }) + return multi_rois, restore_ind, rois_num_per_level + + def read_file(filename, name=None): """ Reads and outputs the bytes contents of a file as a uint8 Tensor