diff --git a/paddle/fluid/distributed/collective/Common.cc b/paddle/fluid/distributed/collective/Common.cc
index 4a883f8196389..3461efbf3aa9a 100644
--- a/paddle/fluid/distributed/collective/Common.cc
+++ b/paddle/fluid/distributed/collective/Common.cc
@@ -17,11 +17,11 @@
 namespace paddle {
 namespace distributed {
 
-std::vector<Place> GetPlaceList(const std::vector<Tensor>& tensors) {
+std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors) {
   std::vector<Place> places;
   places.reserve(tensors.size());
   for (auto& tensor : tensors) {
-    places.push_back(tensor.inner_place());
+    places.push_back(tensor.place());
   }
   return places;
 }
@@ -40,15 +40,11 @@ std::string GetKeyFromPlaces(const std::vector<Place>& places) {
   return placeList;
 }
 
-static bool CheckTensorsInPlace(const std::vector<Tensor>& tensors,
-                                phi::AllocationType type) {
-  return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
-    return t.place().GetType() == type;
-  });
-}
-
-bool CheckTensorsInCudaPlace(const std::vector<Tensor>& tensors) {
-  return CheckTensorsInPlace(tensors, phi::AllocationType::GPU);
+bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors) {
+  return std::all_of(tensors.cbegin(), tensors.cend(),
+                     [&](const phi::DenseTensor& t) {
+                       return platform::is_gpu_place(t.place());
+                     });
 }
 
 }  // namespace distributed
diff --git a/paddle/fluid/distributed/collective/Common.h b/paddle/fluid/distributed/collective/Common.h
index 9569f4c61acef..c01bd23fe127b 100644
--- a/paddle/fluid/distributed/collective/Common.h
+++ b/paddle/fluid/distributed/collective/Common.h
@@ -16,18 +16,18 @@
 #include "paddle/fluid/platform/place.h"
 #include "paddle/phi/api/include/api.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/dense_tensor.h"
 namespace paddle {
 namespace distributed {
 
-using Tensor = paddle::experimental::Tensor;
-
 using Place = paddle::platform::Place;
 // Get the list of devices from list of tensors
-std::vector<Place> GetPlaceList(const std::vector<Tensor>& tensors);
+std::vector<Place> GetPlaceList(const std::vector<phi::DenseTensor>& tensors);
 // Get the deviceList String from the list of devices
 std::string GetKeyFromPlaces(const std::vector<Place>& places);
 
-bool CheckTensorsInCudaPlace(const std::vector<Tensor>& tensors);
+bool CheckTensorsInCudaPlace(const std::vector<phi::DenseTensor>& tensors);
 
 }  // namespace distributed
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc
index 6da83a888683b..6fec3a41e1047 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroup.cc
@@ -17,7 +17,8 @@
 namespace paddle {
 namespace distributed {
 
-ProcessGroup::Task::Task(int rank, const std::vector<Tensor>& inputTensors,
+ProcessGroup::Task::Task(int rank,
+                         const std::vector<phi::DenseTensor>& inputTensors,
                          CommType comm_type)
     : rank_(rank), comm_type_(comm_type) {}
 
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
index 17d021852671e..fbc9c1f476202 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -54,7 +54,7 @@ class ProcessGroup {
  public:
   class Task {
    public:
-    Task(int rank, const std::vector<Tensor>& inputTensors,
+    Task(int rank, const std::vector<phi::DenseTensor>& inputTensors,
          CommType opType = CommType::UNKNOWN);
 
     virtual ~Task();
@@ -79,25 +79,21 @@ class ProcessGroup {
   virtual const std::string GetBackendName() const = 0;
 
   virtual std::shared_ptr<ProcessGroup::Task> AllReduce(
-      std::vector<Tensor>& /* tensors */,
+      std::vector<phi::DenseTensor>& /* input tensors */,   // NOLINT
+      std::vector<phi::DenseTensor>& /* output tensors */,  // 
NOLINT const AllreduceOptions& = AllreduceOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support allreduce", GetBackendName())); } virtual std::shared_ptr Broadcast( - std::vector& /* tensors */, + std::vector& /* input tensors */, // NOLINT + std::vector& /* output tensors */, // NOLINT const BroadcastOptions& = BroadcastOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support broadcast", GetBackendName())); } - virtual void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) { - PADDLE_THROW(platform::errors::Fatal( - "ProcessGroup%s does not support broadcast for static mode runtime", - GetBackendName())); - } - virtual std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -105,42 +101,43 @@ class ProcessGroup { } virtual std::shared_ptr Send( - std::vector& tensors /* tensors */, int dst_rank) { // NOLINT + std::vector&, int) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support send", GetBackendName())); } virtual std::shared_ptr Recv( - std::vector& tensors /* tensors */, int src_rank) { // NOLINT + std::vector& tensors, int) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support receive", GetBackendName())); } virtual std::shared_ptr AllGather( - std::vector& in_tensors /* tensors */, // NOLINT - std::vector& out_tensors /* tensors */) { // NOLINT + std::vector&, // NOLINT + std::vector&) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support AllGather", GetBackendName())); } virtual std::shared_ptr AllToAll( - std::vector& in /* tensors */, // NOLINT - std::vector& out /* tensors */) { // NOLINT + std::vector&, // NOLINT + std::vector&) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support AllToAll", GetBackendName())); } virtual std::shared_ptr Reduce( - std::vector& tensors /* tensors */, // NOLINT - const ReduceOptions& opts) { // NOLINT + std::vector&, // NOLINT + std::vector&, // NOLINT + const ReduceOptions& opts) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support Reduce", GetBackendName())); } virtual std::shared_ptr Scatter( - std::vector& in_tensors /* tensors */, // NOLINT - std::vector& out_tensors /* tensors */, // NOLINT - const ScatterOptions&) { // NOLINT + std::vector&, // NOLINT + std::vector&, // NOLINT + const ScatterOptions&) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support Scatter", GetBackendName())); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index 91c3bf93849e0..6ddea74d95db6 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -27,6 +27,7 @@ #include #include #include +#include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #include "paddle/fluid/platform/enforce.h" @@ -105,107 +106,104 @@ reduce_func get_function(const ReduceOp& r) { exit(-1); } -bool CheckTensorsInCPUPlace(const std::vector& tensors) { - return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { - return t.place() == PlaceType::kCPU; - }); -} - template -T* get_data(const Tensor& tensor) { - auto raw_tensor = 
std::dynamic_pointer_cast(tensor.impl()); - return static_cast(raw_tensor->data()); +T* get_data(phi::DenseTensor& tensor) { // NOLINT + return reinterpret_cast(tensor.data()); } template -std::vector get_multi_data(const std::vector& tensors) { - std::vector ret(tensors.size()); +std::vector get_multi_data( + std::vector& tensors) { // NOLINT + std::vector ret; + ret.reserve(tensors.size()); for (size_t i = 0; i < tensors.size(); i++) { - ret[i] = get_data(tensors[i]); + ret.push_back(get_data(tensors[i])); } return ret; } template -void set_output(P& opts, const Tensor& tensor) { // NOLINT +void set_output(P& opts, phi::DenseTensor& tensor) { // NOLINT opts.setOutput(get_data(tensor), tensor.numel()); } template -void set_input(P& opts, const Tensor& tensor) { // NOLINT +void set_input(P& opts, phi::DenseTensor& tensor) { // NOLINT opts.setInput(get_data(tensor), tensor.numel()); } template -void set_outputs(P& opts, const std::vector& tensors) { // NOLINT +void set_outputs(P& opts, // NOLINT + std::vector& tensors) { // NOLINT opts.setOutputs(get_multi_data(tensors), tensors[0].numel()); } template -void set_inputs(P& opts, const std::vector& tensors) { // NOLINT +void set_inputs(P& opts, // NOLINT + std::vector& tensors) { // NOLINT opts.setInputs(get_multi_data(tensors), tensors[0].numel()); } template -void set_inputs_for_scatter(P& opts, // NOLINT - const std::vector& tensors, // NOLINT +void set_inputs_for_scatter(P& opts, // NOLINT + phi::DenseTensor& tensor, // NOLINT int nranks) { - std::vector ret(nranks); - auto raw_tensor = - std::dynamic_pointer_cast(tensors[0].impl()); - T* raw_pointer = reinterpret_cast(raw_tensor->data()); + std::vector ret; + ret.reserve(nranks); + T* raw_pointer = reinterpret_cast(tensor.data()); size_t offset = 0; for (int i = 0; i < nranks; i++) { - ret[i] = raw_pointer + offset; - offset += tensors[0].numel() / nranks; + ret.push_back(raw_pointer + offset); + offset += tensor.numel() / nranks; } - opts.setInputs(ret, tensors[0].numel() / nranks); + opts.setInputs(ret, tensor.numel() / nranks); } -ProcessGroupGloo::GlooTask::GlooTask(int rank, - const std::vector& inputs, - CommType comm_type) - : ProcessGroup::Task(rank, inputs, comm_type) { - PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true, - platform::errors::Fatal( - "Only CPU place is supported for ProcessGroupGloo.")); -} +ProcessGroupGloo::GlooTask::GlooTask( + int rank, const std::vector& inputs, CommType comm_type) + : ProcessGroup::Task(rank, inputs, comm_type) {} ProcessGroupGloo::ProcessGroupGloo( - const std::shared_ptr& store, int rank, - int world_size, int gid, const std::shared_ptr options) + const std::shared_ptr& store, int rank, int world_size, + int gid, const std::shared_ptr options) : ProcessGroup(rank, world_size, gid), _tag(0), _store(new GlooStore(store)) { _context = std::make_shared(rank, world_size); auto prefix_store = - ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); + ::gloo::rendezvous::PrefixStore(std::to_string(gid), *_store); _context->connectFullMesh(prefix_store, options->device); } class BroadcastGlooTask : public ProcessGroupGloo::GlooTask { public: BroadcastGlooTask(const std::shared_ptr& context, - const std::vector& inputs, int rank, int root, - uint32_t tag) + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + int rank, int root, uint32_t tag) : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST), _context(context), _root(root), _inputs(inputs), + _outputs(outputs), _tag(tag) {} - void Run() override { 
_do_broadcast(_inputs[0]); } + void Run() override { _do_broadcast(_inputs[0], _outputs[0]); } private: std::shared_ptr _context; const int _root; - std::vector _inputs{}; + std::vector _inputs{}; + std::vector _outputs{}; const uint32_t _tag; - void _do_broadcast(const Tensor& tensor) { + void _do_broadcast(phi::DenseTensor& in, phi::DenseTensor& out) { // NOLINT gloo::BroadcastOptions opts(_context); - const auto& dtype = tensor.type(); - GENERATE_FUNC(dtype, set_output, opts, tensor); + const auto& dtype = in.dtype(); + if (rank_ == _root) { + GENERATE_FUNC(dtype, set_input, opts, in); + } + GENERATE_FUNC(dtype, set_output, opts, out); opts.setRoot(_root); opts.setTag(_tag); gloo::broadcast(opts); @@ -213,12 +211,14 @@ class BroadcastGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::Broadcast( - std::vector& inputs, const BroadcastOptions& opts) { + std::vector& inputs, + std::vector& outputs, const BroadcastOptions& opts) { auto root = opts.source_rank; std::unique_ptr task; auto tag = next_tag(); auto context = get_context(); - task = std::make_unique(context, inputs, rank_, root, tag); + task = std::make_unique(context, inputs, outputs, rank_, + root, tag); task->Run(); return task; } @@ -226,19 +226,22 @@ std::shared_ptr ProcessGroupGloo::Broadcast( class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { public: AllreduceGlooTask(int rank, const std::shared_ptr& context, - std::vector& inputs, ReduceOp reduce_op, // NOLINT - uint32_t tag) + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + ReduceOp reduce_op, uint32_t tag) : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE), _context(context), _inputs(inputs), + _outputs(outputs), _reduce_op(reduce_op), _tag(tag) {} - void Run() override { _do_allreduce(_inputs); } + void Run() override { _do_allreduce(_inputs, _outputs); } private: std::shared_ptr _context; - std::vector _inputs; + std::vector _inputs; + std::vector _outputs; const ReduceOp _reduce_op; uint32_t _tag; @@ -255,11 +258,12 @@ class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { fn = get_function(op); } - void _do_allreduce(std::vector& tensors) { // NOLINT - const auto& dtype = tensors[0].type(); + void _do_allreduce(std::vector& ins, // NOLINT + std::vector& outs) { // NOLINT + const auto& dtype = ins[0].dtype(); gloo::AllreduceOptions opts(_context); - GENERATE_FUNC(dtype, set_inputs, opts, tensors); - GENERATE_FUNC(dtype, set_outputs, opts, tensors); + GENERATE_FUNC(dtype, set_inputs, opts, ins); + GENERATE_FUNC(dtype, set_outputs, opts, outs); opts.setReduceFunction(_get_function(dtype, _reduce_op)); opts.setTag(_tag); gloo::allreduce(opts); @@ -267,11 +271,12 @@ class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::AllReduce( - std::vector& inputs, const AllreduceOptions& opts) { + std::vector& inputs, + std::vector& outputs, const AllreduceOptions& opts) { auto tag = next_tag(); std::shared_ptr task; auto context = get_context(); - task = std::make_shared(rank_, context, inputs, + task = std::make_shared(rank_, context, inputs, outputs, opts.reduce_op, tag); task->Run(); return task; @@ -280,7 +285,7 @@ std::shared_ptr ProcessGroupGloo::AllReduce( class BarrierGlooTask : public ProcessGroupGloo::GlooTask { public: BarrierGlooTask(int rank, const std::shared_ptr& context) - : ProcessGroupGloo::GlooTask(rank, std::vector{}, + : ProcessGroupGloo::GlooTask(rank, std::vector{}, CommType::BARRIER), _context(context) {} @@ -307,8 +312,8 @@ 
std::shared_ptr ProcessGroupGloo::Barrier( class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { public: AllgatherGlooTask(int rank, const std::shared_ptr& context, - std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT uint32_t tag) : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER), _context(context), @@ -320,13 +325,13 @@ class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { private: std::shared_ptr _context; - std::vector _inputs; - std::vector _outputs; + std::vector _inputs; + std::vector _outputs; uint32_t _tag; - void _do_allgather(std::vector& in, // NOLINT - std::vector& out) { // NOLINT - const auto& dtype = in[0].type(); + void _do_allgather(std::vector& in, // NOLINT + std::vector& out) { // NOLINT + const auto& dtype = in[0].dtype(); gloo::AllgatherOptions opts(_context); GENERATE_FUNC(dtype, set_input, opts, in[0]); GENERATE_FUNC(dtype, set_output, opts, out[0]); @@ -336,7 +341,8 @@ class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::AllGather( - std::vector& in_tensors, std::vector& out_tensors) { + std::vector& in_tensors, + std::vector& out_tensors) { std::shared_ptr task; auto tag = next_tag(); auto context = get_context(); @@ -349,20 +355,23 @@ std::shared_ptr ProcessGroupGloo::AllGather( class ReduceGlooTask : public ProcessGroupGloo::GlooTask { public: ReduceGlooTask(int rank, const std::shared_ptr& context, - std::vector& in, ReduceOp reduce_op, // NOLINT - int dst, uint32_t tag) - : ProcessGroupGloo::GlooTask(rank, in, CommType::REDUCE), + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + ReduceOp reduce_op, int dst, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::REDUCE), _context(context), - _inputs(in), + _inputs(inputs), + _outputs(outputs), _reduce_op(reduce_op), _dst(dst), _tag(tag) {} - void Run() override { _do_reduce(_inputs, _dst); } + void Run() override { _do_reduce(_inputs, _outputs, _dst); } private: std::shared_ptr _context; - std::vector _inputs; + std::vector _inputs; + std::vector _outputs; const ReduceOp _reduce_op; int _dst; uint32_t _tag; @@ -380,11 +389,13 @@ class ReduceGlooTask : public ProcessGroupGloo::GlooTask { fn = get_function(op); } - void _do_reduce(std::vector& tensors, int dst) { // NOLINT - const auto& dtype = tensors[0].type(); + void _do_reduce(std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + int dst) { + const auto& dtype = inputs[0].dtype(); gloo::ReduceOptions opts(_context); - GENERATE_FUNC(dtype, set_input, opts, tensors[0]); - GENERATE_FUNC(dtype, set_output, opts, tensors[0]); + GENERATE_FUNC(dtype, set_input, opts, inputs[0]); + GENERATE_FUNC(dtype, set_output, opts, outputs[0]); opts.setReduceFunction(_get_function(dtype, _reduce_op)); opts.setTag(_tag); opts.setRoot(dst); @@ -393,11 +404,12 @@ class ReduceGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::Reduce( - std::vector& tensors, const ReduceOptions& opts) { + std::vector& inputs, + std::vector& outputs, const ReduceOptions& opts) { std::shared_ptr task; auto tag = next_tag(); auto context = get_context(); - task = std::make_shared(rank_, context, tensors, + task = std::make_shared(rank_, context, inputs, outputs, opts.reduce_op, opts.root_rank, tag); task->Run(); return task; @@ -406,8 +418,8 @@ std::shared_ptr ProcessGroupGloo::Reduce( class ScatterGlooTask : public ProcessGroupGloo::GlooTask { public: ScatterGlooTask(int 
rank, const std::shared_ptr& context, - std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT int src, int size, uint32_t tag) : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER), _context(context), @@ -421,18 +433,19 @@ class ScatterGlooTask : public ProcessGroupGloo::GlooTask { private: std::shared_ptr _context; - std::vector _inputs; - std::vector _outputs; + std::vector _inputs; + std::vector _outputs; int _src; int _size; uint32_t _tag; - void _do_scatter(std::vector& in, std::vector& out, // NOLINT + void _do_scatter(std::vector& in, // NOLINT + std::vector& out, // NOLINT int src) { - const auto& dtype = in[0].type(); + const auto& dtype = in[0].dtype(); gloo::ScatterOptions opts(_context); if (rank_ == src) { - GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in, _size); + GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in[0], _size); } GENERATE_FUNC(dtype, set_output, opts, out[0]); opts.setRoot(src); @@ -442,8 +455,8 @@ class ScatterGlooTask : public ProcessGroupGloo::GlooTask { }; std::shared_ptr ProcessGroupGloo::Scatter( - std::vector& in_tensors, std::vector& out_tensors, - const ScatterOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const ScatterOptions& opts) { std::shared_ptr task; auto tag = next_tag(); auto context = get_context(); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index f0bf872cfc9e4..335ca1bd17f2c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -36,7 +36,8 @@ class ProcessGroupGloo : public ProcessGroup { class GlooTask : public ProcessGroup::Task, public std::enable_shared_from_this { public: - explicit GlooTask(int rank, const std::vector& input_tensors, + explicit GlooTask(int rank, + const std::vector& input_tensors, CommType comm_type); ~GlooTask() = default; @@ -106,26 +107,31 @@ class ProcessGroupGloo : public ProcessGroup { ~ProcessGroupGloo() = default; std::shared_ptr Broadcast( - std::vector& inputs, + std::vector& inputs, + std::vector& outputs, const BroadcastOptions& = BroadcastOptions()) override; std::shared_ptr AllReduce( - std::vector& inputs, + std::vector& inputs, + std::vector& outputs, const AllreduceOptions& opts = AllreduceOptions()) override; std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; std::shared_ptr AllGather( - std::vector& in_tensors, - std::vector& out_tensors) override; + std::vector& in_tensors, + std::vector& out_tensors) override; std::shared_ptr Reduce( - std::vector& tensors, const ReduceOptions& opts) override; - - std::shared_ptr Scatter(std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions&) override; + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts) override; + + std::shared_ptr Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; std::shared_ptr<::gloo::Context> get_context() { return _context; } uint64_t next_tag() { return _tag++; } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc index 55945b5e0e396..55ecdaaf6bfb7 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -44,14 +44,14 @@ void SyncDefaultStream( std::shared_ptr ProcessGroupHCCL::CreateTask( 
std::vector places, int rank, CommType comm_type, - const std::vector& inputs) { + const std::vector& inputs) { return std::make_shared(places, rank, comm_type, inputs); } -ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector& places, int rank, - CommType CommType, - const std::vector& inputs) +ProcessGroupHCCL::HCCLTask::HCCLTask( + const std::vector& places, int rank, CommType CommType, + const std::vector& inputs) : Task(rank, inputs, CommType), places_(places) { control_events_.resize(places.size()); hcclComms_.resize(places.size()); @@ -60,8 +60,8 @@ ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector& places, int rank, ProcessGroupHCCL::HCCLTask::~HCCLTask() {} void ProcessGroupHCCL::HCCLTask::SetOutputs( - std::vector& outputs) { // NOLINT - outputs_ = std::make_shared>(outputs); + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); } void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() { @@ -166,8 +166,8 @@ void ProcessGroupHCCL::CreateHCCLManagerCache( template std::shared_ptr ProcessGroupHCCL::Collective( - std::vector& inputs, std::vector& outputs, Fn fn, - CommType op_type) { + std::vector& inputs, + std::vector& outputs, Fn fn, CommType op_type) { const auto places = GetPlaceList(inputs); const auto key = GetKeyFromPlaces(places); @@ -208,91 +208,44 @@ std::shared_ptr ProcessGroupHCCL::Collective( return task; } -template -std::shared_ptr ProcessGroupHCCL::PointToPoint( - std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { - const auto places = GetPlaceList(tensors); - const auto key = GetKeyFromPlaces(places); - - { - std::lock_guard lock(mutex_); - if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { - CreateHCCLManagerCache(key, places); - } - } - - auto& hccl_comms = places_to_hcclcomm_[key]; - - SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); - - auto task = CreateTask(places, rank_, op_type, tensors); - - // construct uninitialize guard for device - - // if (FLAGS_use_stream_safe_npu_allocator) { - // for (size_t i = 0; i < tensors.size(); ++i) { - // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); - // auto dense_tensor = - // std::dynamic_pointer_cast(tensors[i].impl()); - // memory::RecordStream(dense_tensor->Holder(), - // places_to_ctx_[key][i]->stream()); - // } - // } - - for (size_t i = 0; i < tensors.size(); ++i) { - platform::NPUDeviceGuard guard(places[i].GetDeviceId()); - const auto& hccl_stream = places_to_ctx_[key][i]->stream(); - fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank); - } - - for (size_t i = 0; i < tensors.size(); ++i) { - platform::NPUDeviceGuard guard(places[i].GetDeviceId()); - task->control_events_[i].Record(*places_to_ctx_[key][i]); - } - return task; -} - std::shared_ptr ProcessGroupHCCL::AllReduce( - std::vector& tensors, const AllreduceOptions& opts) { - // PADDLE_ENFORCE_EQ( - // CheckTensorsInNPUPlace(tensors), true, - // platform::errors::InvalidArgument("All inputs should be in - // NPUPlace.")); - return Collective( - tensors, tensors, - [&](const Tensor& input, Tensor& output, HcclComm comm, - const aclrtStream& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::HcclAllReduce( - input_tensor->data(), output_tensor->data(), input_tensor->numel(), - platform::ToHCCLDataType(input.type()), - ToHCCLRedType(opts.reduce_op), comm, stream); - }, - CommType::ALLREDUCE); + std::vector& in_tensors, // NOLINT + 
std::vector& out_tensors, // NOLINT + const AllreduceOptions& opts) { + return Collective(in_tensors, out_tensors, + [&](phi::DenseTensor& input, phi::DenseTensor& output, + HcclComm comm, const aclrtStream& stream) { + return platform::dynload::HcclAllReduce( + input.data(), output.data(), input.numel(), + platform::ToHCCLDataType(input.dtype()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); } std::shared_ptr ProcessGroupHCCL::Broadcast( - std::vector& tensors, const BroadcastOptions& opts) { + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const BroadcastOptions& opts) { // PADDLE_ENFORCE_EQ( // CheckTensorsInNPUPlace(tensors), true, // platform::errors::InvalidArgument("All inputs should be in // CudaPlace.")); return Collective( - tensors, tensors, - [&](Tensor& input, Tensor& output, HcclComm comm, + in_tensors, out_tensors, + [&](phi::DenseTensor& input, phi::DenseTensor& output, HcclComm comm, const aclrtStream& stream) { - const auto root = opts.source_rank * tensors.size() + opts.source_root; - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::HcclBroadcast( - input_tensor->data(), input_tensor->numel(), - platform::ToHCCLDataType(input.type()), root, comm, stream); + int root = opts.source_rank * in_tensors.size() + opts.source_root; + if (rank_ == root) { + return platform::dynload::HcclBroadcast( + input.data(), input.numel(), + platform::ToHCCLDataType(input.dtype()), root, comm, stream); + } else { + return platform::dynload::HcclBroadcast( + output.data(), output.numel(), + platform::ToHCCLDataType(output.dtype()), root, comm, stream); + } }, CommType::BROADCAST); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h index 932ae75fc6b9d..f3d3fa2f8a72a 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -46,7 +46,7 @@ class ProcessGroupHCCL : public ProcessGroup { public std::enable_shared_from_this { public: HCCLTask(const std::vector& places, int rank, CommType CommType, - const std::vector& inputs); + const std::vector& inputs); bool IsCompleted(); @@ -56,7 +56,7 @@ class ProcessGroupHCCL : public ProcessGroup { void Synchronize(); - void SetOutputs(std::vector& outputs); // NOLINT + void SetOutputs(std::vector& outputs); // NOLINT virtual ~HCCLTask(); @@ -65,7 +65,7 @@ class ProcessGroupHCCL : public ProcessGroup { protected: std::vector places_; std::vector> hcclComms_; - std::shared_ptr> outputs_; + std::shared_ptr> outputs_; private: }; @@ -78,17 +78,19 @@ class ProcessGroupHCCL : public ProcessGroup { } std::shared_ptr AllReduce( - std::vector& tensors, + std::vector& in_tensors, + std::vector& out_tensors, const AllreduceOptions& = AllreduceOptions()) override; std::shared_ptr Broadcast( - std::vector& tensors, + std::vector& in_tensors, + std::vector& out_tensors, const BroadcastOptions& = BroadcastOptions()) override; protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, - const std::vector& inputs); + const std::vector& inputs); std::shared_ptr store_; std::shared_ptr hccl_comm_; @@ -113,15 +115,10 @@ class ProcessGroupHCCL : public ProcessGroup { template std::shared_ptr Collective( - std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT + std::vector& inputs, // NOLINT + std::vector& outputs, // 
NOLINT Fn fn, CommType op_type); - template - std::shared_ptr PointToPoint( - std::vector& tensors, // NOLINT - Fn fn, int dst_rank, CommType op_type); - void CreateHCCLManagerCache(const std::string& places_key, const std::vector& places); }; diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc index b3c9ddde50116..a48bda06323be 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.cc @@ -26,13 +26,13 @@ namespace distributed { using Place = paddle::platform::Place; std::shared_ptr ProcessGroupHeter::CreateTask( - int rank, CommType comm_type, const std::vector& inputs) { + int rank, CommType comm_type, const std::vector& inputs) { return std::make_shared(rank, comm_type, inputs); } -ProcessGroupHeter::HeterTask::HeterTask(int rank, CommType CommType, - const std::vector& inputs) +ProcessGroupHeter::HeterTask::HeterTask( + int rank, CommType CommType, const std::vector& inputs) : Task(rank, inputs, CommType) {} ProcessGroupHeter::HeterTask::~HeterTask() {} @@ -86,248 +86,177 @@ static void _do_add(T* dst, T* src, size_t size) { } std::shared_ptr ProcessGroupHeter::AllReduce( - std::vector& tensors, const AllreduceOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const AllreduceOptions& opts) { #if defined(PADDLE_WITH_NCCL) PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); #endif // Step1: do allreduce in inner cluster - auto task = inner_pg_->AllReduce(tensors, opts); + auto task = inner_pg_->AllReduce(in_tensors, in_tensors, opts); task->Wait(); // Step2: copy tensors to CPU if (local_rank_ == 0) { - std::vector cpu_tensors; - cpu_tensors.reserve(tensors.size()); - for (size_t i = 0; i < tensors.size(); i++) { - auto dense_gpu_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - phi::DenseTensorMeta meta = phi::DenseTensorMeta( - dense_gpu_tensor->dtype(), dense_gpu_tensor->dims()); - std::shared_ptr dense_cpu_tensor = - std::make_shared( - std::make_unique( - paddle::platform::CPUPlace()) - .get(), - meta); - dense_cpu_tensor->ResizeAndAllocate(dense_gpu_tensor->dims()); - cpu_tensors[i] = paddle::experimental::Tensor(dense_cpu_tensor); - framework::TensorCopySync(*dense_gpu_tensor, platform::CPUPlace(), - dense_cpu_tensor.get()); + std::vector cpu_tensors; + cpu_tensors.reserve(in_tensors.size()); + for (size_t i = 0; i < in_tensors.size(); i++) { + auto gpu_tensor = in_tensors[i]; + auto cpu_tensor = cpu_tensors[i]; + cpu_tensor.Resize(gpu_tensor.dims()); + framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor); } // Step3: do inter cluster allreduce if (with_switch_) { if (local_rank_ == 0) { HeterClient* client_ = HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[0].impl()); + auto dense_cpu_tensor = cpu_tensors[0]; std::vector send_size; - send_size.push_back(dense_cpu_tensor->numel()); + send_size.push_back(dense_cpu_tensor.numel()); int ret = client_->Send( - gid_, {dense_cpu_tensor->name()}, send_size, - dense_cpu_tensor->data(), - dense_cpu_tensor->numel() * - framework::DataTypeSize(dense_cpu_tensor->dtype())); + gid_, 
{dense_cpu_tensor.name()}, send_size, dense_cpu_tensor.data(), + dense_cpu_tensor.numel() * + framework::DataTypeSize(dense_cpu_tensor.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Send to the switch module error.")); phi::DenseTensorMeta meta = phi::DenseTensorMeta( - dense_cpu_tensor->dtype(), dense_cpu_tensor->dims()); + dense_cpu_tensor.dtype(), dense_cpu_tensor.dims()); std::shared_ptr dense_cpu_tensor2 = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) .get(), meta); - dense_cpu_tensor2->ResizeAndAllocate(dense_cpu_tensor->dims()); - Tensor cpu_tensor_temp = - paddle::experimental::Tensor(dense_cpu_tensor2); + dense_cpu_tensor2->ResizeAndAllocate(dense_cpu_tensor.dims()); ret = client_->Recv( - gid_, {dense_cpu_tensor->name()}, dense_cpu_tensor2->data(), + gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor2->data(), dense_cpu_tensor2->numel() * framework::DataTypeSize(dense_cpu_tensor2->dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Recv from the switch module error.")); - switch (dense_cpu_tensor->dtype()) { + switch (dense_cpu_tensor.dtype()) { case DataType::FLOAT32: - _do_add(reinterpret_cast(dense_cpu_tensor->data()), + _do_add(reinterpret_cast(dense_cpu_tensor.data()), reinterpret_cast(dense_cpu_tensor2->data()), - dense_cpu_tensor->numel()); + dense_cpu_tensor.numel()); break; case DataType::FLOAT64: _do_add( - reinterpret_cast(dense_cpu_tensor->data()), + reinterpret_cast(dense_cpu_tensor.data()), reinterpret_cast(dense_cpu_tensor2->data()), - dense_cpu_tensor->numel()); + dense_cpu_tensor.numel()); break; case DataType::INT32: - _do_add(reinterpret_cast(dense_cpu_tensor->data()), + _do_add(reinterpret_cast(dense_cpu_tensor.data()), reinterpret_cast(dense_cpu_tensor2->data()), - dense_cpu_tensor->numel()); + dense_cpu_tensor.numel()); break; default: PADDLE_THROW(platform::errors::PreconditionNotMet( "Unsupported data type (%s) to do add.", - framework::DataType2String(dense_cpu_tensor->dtype()))); + framework::DataType2String(dense_cpu_tensor.dtype()))); } } } else { - auto gloo_task = inter_pg_->AllReduce(cpu_tensors, opts); + auto gloo_task = inter_pg_->AllReduce(cpu_tensors, cpu_tensors, opts); gloo_task->Wait(); } // Step4: copy cpu tensors to gpu // copy cpu tensors to gpu - for (size_t i = 0; i < tensors.size(); i++) { - auto dense_gpu_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[i].impl()); - framework::TensorCopySync(*dense_cpu_tensor, dense_cpu_tensor->place(), - dense_gpu_tensor.get()); + for (size_t i = 0; i < in_tensors.size(); i++) { + auto gpu_tensor = out_tensors[i]; + auto cpu_tensor = cpu_tensors[i]; + framework::TensorCopySync(cpu_tensor, cpu_tensor.place(), &gpu_tensor); } } // Step5: broadcast among inner cluster auto b_opts = BroadcastOptions(); - b_opts.source_root = 0; - auto broadcast_task = inner_pg_->Broadcast(tensors, b_opts); + b_opts.source_rank = 0; + auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts); broadcast_task->Wait(); - return CreateTask(rank_, CommType::ALLREDUCE, tensors); + return CreateTask(rank_, CommType::ALLREDUCE, in_tensors); } std::shared_ptr ProcessGroupHeter::Broadcast( - std::vector& tensors, const BroadcastOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const BroadcastOptions& opts) { #if defined(PADDLE_WITH_NCCL) PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, 
platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); #endif // Step1: do broadcast in inner cluster auto b_opts = BroadcastOptions(); - b_opts.source_root = 0; - inner_pg_->Broadcast(tensors, b_opts); + b_opts.source_rank = 0; + inner_pg_->Broadcast(in_tensors, out_tensors, b_opts); if (local_rank_ == 0) { - std::vector cpu_tensors; - cpu_tensors.reserve(tensors.size()); - for (size_t i = 0; i < tensors.size(); i++) { - auto dense_gpu_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - phi::DenseTensorMeta meta = phi::DenseTensorMeta( - dense_gpu_tensor->dtype(), dense_gpu_tensor->dims()); - std::shared_ptr dense_cpu_tensor = - std::make_shared( - std::make_unique( - paddle::platform::CPUPlace()) - .get(), - meta); - dense_cpu_tensor->ResizeAndAllocate(dense_gpu_tensor->dims()); - cpu_tensors[i] = paddle::experimental::Tensor(dense_cpu_tensor); - framework::TensorCopySync(*dense_gpu_tensor, platform::CPUPlace(), - dense_cpu_tensor.get()); + std::vector cpu_tensors; + cpu_tensors.reserve(in_tensors.size()); + for (size_t i = 0; i < in_tensors.size(); i++) { + auto gpu_tensor = in_tensors[i]; + auto cpu_tensor = cpu_tensors[i]; + cpu_tensor.Resize(gpu_tensor.dims()); + framework::TensorCopySync(gpu_tensor, platform::CPUPlace(), &cpu_tensor); } if (with_switch_) { if (local_rank_ == 0) { HeterClient* client_ = HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[0].impl()); + auto dense_cpu_tensor = cpu_tensors[0]; if (gloo_rank_ == 0) { std::vector send_size; - send_size.push_back(dense_cpu_tensor->numel()); + send_size.push_back(dense_cpu_tensor.numel()); int ret = client_->Send( - gid_, {dense_cpu_tensor->name()}, send_size, - dense_cpu_tensor->data(), - dense_cpu_tensor->numel() * - framework::DataTypeSize(dense_cpu_tensor->dtype())); + gid_, {dense_cpu_tensor.name()}, send_size, + dense_cpu_tensor.data(), + dense_cpu_tensor.numel() * + framework::DataTypeSize(dense_cpu_tensor.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Send to the switch module error.")); } else { int ret = client_->Recv( - gid_, {dense_cpu_tensor->name()}, dense_cpu_tensor->data(), - dense_cpu_tensor->numel() * - framework::DataTypeSize(dense_cpu_tensor->dtype())); + gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor.data(), + dense_cpu_tensor.numel() * + framework::DataTypeSize(dense_cpu_tensor.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Receive from the switch module error.")); ret = client_->Recv( - gid_, {dense_cpu_tensor->name()}, dense_cpu_tensor->data(), - dense_cpu_tensor->numel() * - framework::DataTypeSize(dense_cpu_tensor->dtype())); + gid_, {dense_cpu_tensor.name()}, dense_cpu_tensor.data(), + dense_cpu_tensor.numel() * + framework::DataTypeSize(dense_cpu_tensor.dtype())); PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( "Receive from the switch module error.")); } } } else { - auto gloo_task = inter_pg_->Broadcast(cpu_tensors, opts); + auto gloo_task = inter_pg_->Broadcast(cpu_tensors, cpu_tensors, opts); gloo_task->Wait(); } - for (size_t i = 0; i < tensors.size(); i++) { - auto dense_gpu_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - auto dense_cpu_tensor = - std::dynamic_pointer_cast(cpu_tensors[i].impl()); - framework::TensorCopySync(*dense_cpu_tensor, 
dense_cpu_tensor->place(), - dense_gpu_tensor.get()); + for (size_t i = 0; i < in_tensors.size(); i++) { + auto gpu_tensor = out_tensors[i]; + auto cpu_tensor = cpu_tensors[i]; + framework::TensorCopySync(cpu_tensor, gpu_tensor.place(), &gpu_tensor); } } - auto broadcast_task = inner_pg_->Broadcast(tensors, b_opts); + auto broadcast_task = inner_pg_->Broadcast(out_tensors, out_tensors, b_opts); broadcast_task->Wait(); - return CreateTask(rank_, CommType::BROADCAST, tensors); -} - -void ProcessGroupHeter::Broadcast(const phi::DenseTensor* in, - phi::DenseTensor* out) { - // Step1: do broadcast in inner cluster - inner_pg_->Broadcast(in, out); - - if (local_rank_ == 0) { - phi::DenseTensorMeta meta = phi::DenseTensorMeta(in->dtype(), in->dims()); - std::shared_ptr dense_cpu_tensor = - std::make_shared( - std::make_unique( - paddle::platform::CPUPlace()) - .get(), - meta); - dense_cpu_tensor->ResizeAndAllocate(in->dims()); - Tensor cpu_tensor = paddle::experimental::Tensor(dense_cpu_tensor); - framework::TensorCopySync(*in, platform::CPUPlace(), - dense_cpu_tensor.get()); - if (with_switch_) { - if (local_rank_ == 0) { - HeterClient* client_ = - HeterClient::GetInstance({switch_endpoint_}, {}, 0).get(); - if (gloo_rank_ == 0) { - std::vector send_size; - send_size.push_back(in->numel()); - int ret = client_->Send( - gid_, {in->name()}, send_size, dense_cpu_tensor->data(), - in->numel() * framework::DataTypeSize(in->dtype())); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "Send to the switch module error.")); - } else { - int ret = - client_->Recv(gid_, {in->name()}, dense_cpu_tensor->data(), - in->numel() * framework::DataTypeSize(in->dtype())); - PADDLE_ENFORCE_EQ(ret, 0, - platform::errors::PreconditionNotMet( - "Receive from the switch module error.")); - } - } - } else { - std::vector cpu_tensors = {cpu_tensor}; - auto gloo_task = inter_pg_->Broadcast(cpu_tensors); - gloo_task->Wait(); - } - framework::TensorCopySync(*dense_cpu_tensor, out->place(), out); - } - inner_pg_->Broadcast(out, out); + return CreateTask(rank_, CommType::BROADCAST, in_tensors); } -} // namespace distributed -} // namespace paddle +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHeter.h b/paddle/fluid/distributed/collective/ProcessGroupHeter.h index 892dbb9369e8d..05bacd93d7815 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupHeter.h +++ b/paddle/fluid/distributed/collective/ProcessGroupHeter.h @@ -66,7 +66,8 @@ class ProcessGroupHeter : public ProcessGroup { class HeterTask : public ProcessGroup::Task, public std::enable_shared_from_this { public: - HeterTask(int rank, CommType CommType, const std::vector& inputs); + HeterTask(int rank, CommType CommType, + const std::vector&); bool IsCompleted(); @@ -89,18 +90,16 @@ class ProcessGroupHeter : public ProcessGroup { } std::shared_ptr AllReduce( - std::vector& tensors, + std::vector&, std::vector&, const AllreduceOptions& = AllreduceOptions()) override; std::shared_ptr Broadcast( - std::vector& tensors, + std::vector&, std::vector&, const BroadcastOptions& = BroadcastOptions()) override; - void Broadcast(const phi::DenseTensor* in, phi::DenseTensor* out) override; - protected: virtual std::shared_ptr CreateTask( - int rank, CommType opType, const std::vector& inputs); + int rank, CommType opType, const std::vector& inputs); private: std::shared_ptr store_; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc 
b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index b1d892e2521a3..30813b904df53 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -41,14 +41,14 @@ void SyncDefaultStream( std::shared_ptr ProcessGroupNCCL::CreateTask( std::vector places, int rank, CommType comm_type, - const std::vector& inputs) { + const std::vector& inputs) { return std::make_shared(places, rank, comm_type, inputs); } -ProcessGroupNCCL::NCCLTask::NCCLTask(const std::vector& places, int rank, - CommType CommType, - const std::vector& inputs) +ProcessGroupNCCL::NCCLTask::NCCLTask( + const std::vector& places, int rank, CommType CommType, + const std::vector& inputs) : Task(rank, inputs, CommType), places_(places) { control_events_.resize(places.size()); ncclComms_.resize(places.size()); @@ -57,8 +57,8 @@ ProcessGroupNCCL::NCCLTask::NCCLTask(const std::vector& places, int rank, ProcessGroupNCCL::NCCLTask::~NCCLTask() {} void ProcessGroupNCCL::NCCLTask::SetOutputs( - std::vector& outputs) { // NOLINT - outputs_ = std::make_shared>(outputs); + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); } void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() { @@ -180,8 +180,8 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( template std::shared_ptr ProcessGroupNCCL::Collective( - std::vector& inputs, std::vector& outputs, Fn fn, - CommType op_type) { + std::vector& inputs, + std::vector& outputs, Fn fn, CommType op_type) { const auto places = GetPlaceList(inputs); const auto key = GetKeyFromPlaces(places); @@ -205,9 +205,7 @@ std::shared_ptr ProcessGroupNCCL::Collective( if (FLAGS_use_stream_safe_cuda_allocator) { for (size_t i = 0; i < inputs.size(); ++i) { cuda_guard.SetDevice(places[i]); - auto dense_tensor = - std::dynamic_pointer_cast(inputs[i].impl()); - memory::RecordStream(dense_tensor->Holder(), + memory::RecordStream(inputs[i].Holder(), places_to_ctx_[key][i]->stream()); } } @@ -267,7 +265,8 @@ void ProcessGroupNCCL::Collective(const phi::DenseTensor* in, template std::shared_ptr ProcessGroupNCCL::PointToPoint( - std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + std::vector& tensors, Fn fn, int dst_rank, + CommType op_type) { const auto places = GetPlaceList(tensors); const auto key = GetKeyFromPlaces(places); @@ -290,9 +289,7 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( if (FLAGS_use_stream_safe_cuda_allocator) { for (size_t i = 0; i < tensors.size(); ++i) { cuda_guard.SetDevice(places[i]); - auto dense_tensor = - std::dynamic_pointer_cast(tensors[i].impl()); - memory::RecordStream(dense_tensor->Holder(), + memory::RecordStream(tensors[i].Holder(), places_to_ctx_[key][i]->stream()); } } @@ -314,46 +311,40 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( } std::shared_ptr ProcessGroupNCCL::AllReduce( - std::vector& tensors, const AllreduceOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const AllreduceOptions& opts) { PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective( - tensors, tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, - const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::ncclAllReduce( - input_tensor->data(), output_tensor->data(), 
input_tensor->numel(), - platform::ToNCCLDataType(input.type()), - ToNCCLRedType(opts.reduce_op), comm, stream); - }, - CommType::ALLREDUCE); + return Collective(in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + return platform::dynload::ncclAllReduce( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); } std::shared_ptr ProcessGroupNCCL::Broadcast( - std::vector& tensors, const BroadcastOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const BroadcastOptions& opts) { PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); - return Collective( - tensors, tensors, - [&](Tensor& input, Tensor& output, ncclComm_t comm, - const gpuStream_t& stream) { - const auto root = opts.source_rank * tensors.size() + opts.source_root; - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::ncclBcast( - input_tensor->data(), input_tensor->numel(), - platform::ToNCCLDataType(input.type()), root, comm, stream); - }, - CommType::BROADCAST); + return Collective(in_tensors, out_tensors, + [&](phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + const auto root = opts.source_rank * in_tensors.size() + + opts.source_root; + return platform::dynload::ncclBroadcast( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.type()), root, comm, + stream); + }, + CommType::BROADCAST); } std::shared_ptr ProcessGroupNCCL::Barrier( @@ -374,23 +365,24 @@ std::shared_ptr ProcessGroupNCCL::Barrier( places.emplace_back(place_id); } - std::vector barrierTensors; + std::vector barrierTensors; barrierTensors.reserve(places.size()); platform::CUDADeviceGuard gpuGuard; for (auto& place : places) { gpuGuard.SetDeviceIndex(place.GetDeviceId()); auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace()); - barrierTensors.push_back(dt); + barrierTensors.push_back( + *std::dynamic_pointer_cast(dt.impl())); } - auto task = ProcessGroupNCCL::AllReduce(barrierTensors); + auto task = ProcessGroupNCCL::AllReduce(barrierTensors, barrierTensors); auto nccl_task = dynamic_cast(task.get()); nccl_task->barrierTensors_ = std::move(barrierTensors); return task; } -void CheckTensorsInDifferentDevices(const std::vector& tensors, - const size_t num_devices) { +void CheckTensorsInDifferentDevices( + const std::vector& tensors, const size_t num_devices) { PADDLE_ENFORCE_EQ( tensors.size() == 0, false, platform::errors::InvalidArgument("Tensor list must be nonempty.")); @@ -402,11 +394,11 @@ void CheckTensorsInDifferentDevices(const std::vector& tensors, std::set used_devices; for (const auto& t : tensors) { - PADDLE_ENFORCE_EQ(t.is_gpu() && t.is_dense_tensor(), true, + PADDLE_ENFORCE_EQ(platform::is_gpu_place(t.place()), true, platform::errors::InvalidArgument( "Tensors must be CUDA and dense tensor.")); - const auto inserted = used_devices.insert(t.inner_place()).second; + const auto inserted = used_devices.insert(t.place()).second; PADDLE_ENFORCE_EQ(inserted, true, platform::errors::InvalidArgument( "Tensors must be on distinct GPU devices.")); @@ -414,62 +406,55 @@ void CheckTensorsInDifferentDevices(const std::vector& tensors, } 
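
The hunks above switch every NCCL collective to explicit input and output `phi::DenseTensor` vectors, reading `dtype()`, `numel()`, and the allocation holder directly instead of unwrapping `Tensor::impl()` inside each lambda. The sketch below is not part of the patch; it only illustrates the caller-side adaptation, mirroring the unwrap-then-allreduce pattern that reducer.cc adopts later in this diff. The helper name `AllReduceInPlace` is hypothetical.

```cpp
#include <memory>
#include <vector>

#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/core/dense_tensor.h"

namespace paddle {
namespace distributed {

// Hedged sketch (hypothetical helper, not in the patch): unwrap high-level
// Tensors into DenseTensors and run an in-place allreduce with the reworked
// two-vector interface, as reducer.cc does in this diff.
void AllReduceInPlace(const std::shared_ptr<ProcessGroup>& pg,
                      std::vector<paddle::experimental::Tensor>& tensors) {
  std::vector<phi::DenseTensor> in_out;
  in_out.reserve(tensors.size());
  for (auto& t : tensors) {
    // Tensor::impl() holds the underlying phi::DenseTensor.
    in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
  }
  AllreduceOptions opts;
  opts.reduce_op = ReduceOp::SUM;
  // The same vector is passed as inputs and outputs for an in-place result.
  pg->AllReduce(in_out, in_out, opts)->Synchronize();
}

}  // namespace distributed
}  // namespace paddle
```
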
std::shared_ptr ProcessGroupNCCL::Send( - std::vector& tensors, int dst_rank) { + std::vector& tensors, int dst_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); - auto task = PointToPoint( - tensors, - [&](Tensor& input, ncclComm_t comm, const gpuStream_t& stream, - int dst_rank) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - return platform::dynload::ncclSend( - input_tensor->data(), input_tensor->numel(), - platform::ToNCCLDataType(input.type()), dst_rank, comm, stream); - }, - dst_rank, CommType::SEND); + auto task = PointToPoint(tensors, + [&](phi::DenseTensor& input, ncclComm_t comm, + const gpuStream_t& stream, int dst_rank) { + return platform::dynload::ncclSend( + input.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, comm, stream); + }, + dst_rank, CommType::SEND); return task; } std::shared_ptr ProcessGroupNCCL::Recv( - std::vector& tensors, int src_rank) { + std::vector& tensors, int src_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); - auto task = PointToPoint( - tensors, - [&](Tensor& output, ncclComm_t comm, const gpuStream_t& stream, - int src_rank) { - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::ncclRecv( - output_tensor->data(), output_tensor->numel(), - platform::ToNCCLDataType(output.type()), src_rank, comm, stream); - }, - src_rank, CommType::RECV); + auto task = PointToPoint(tensors, + [&](phi::DenseTensor& output, ncclComm_t comm, + const gpuStream_t& stream, int src_rank) { + return platform::dynload::ncclRecv( + output.data(), output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, comm, stream); + }, + src_rank, CommType::RECV); return task; } std::shared_ptr ProcessGroupNCCL::AllGather( - std::vector& in_tensors, std::vector& out_tensors) { + std::vector& in_tensors, + std::vector& out_tensors) { PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(out_tensors), true, platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); - return Collective( - in_tensors, out_tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, - const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); - return platform::dynload::ncclAllGather( - input_tensor->data(), output_tensor->data(), input_tensor->numel(), - platform::ToNCCLDataType(input.type()), comm, stream); - }, - CommType::ALLGATHER); + return Collective(in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), comm, + stream); + }, + CommType::ALLGATHER); } void* GetPointerByOffset(void* raw_pointer, size_t offset, @@ -493,10 +478,12 @@ void* GetPointerByOffset(void* raw_pointer, size_t offset, PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); } + return nullptr; } std::shared_ptr ProcessGroupNCCL::AllToAll( - std::vector& in_tensors, std::vector& out_tensors) { + std::vector& in_tensors, + std::vector& out_tensors) { PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); @@ -505,24 
+492,20 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); return Collective( in_tensors, out_tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, + [&](phi::DenseTensor& input, phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); size_t offset = 0; PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto i = 0; i < size_; i++) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( - GetPointerByOffset(input_tensor->data(), offset, input.type()), - input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), i, comm, stream)); + GetPointerByOffset(input.data(), offset, input.dtype()), + input.numel() / size_, platform::ToNCCLDataType(input.dtype()), i, + comm, stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - GetPointerByOffset(output_tensor->data(), offset, input.type()), - input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), i, comm, stream)); - offset += input_tensor->numel() / size_; + GetPointerByOffset(output.data(), offset, input.dtype()), + input.numel() / size_, platform::ToNCCLDataType(input.dtype()), i, + comm, stream)); + offset += input.numel() / size_; } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); }, @@ -530,29 +513,26 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( } std::shared_ptr ProcessGroupNCCL::Reduce( - std::vector& tensors, const ReduceOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const ReduceOptions& opts) { PADDLE_ENFORCE_EQ( - CheckTensorsInCudaPlace(tensors), true, + CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); return Collective( - tensors, tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, - const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); + in_tensors, out_tensors, + [&](const phi::DenseTensor& input, phi::DenseTensor& output, + ncclComm_t comm, const gpuStream_t& stream) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( - input_tensor->data(), output_tensor->data(), input.numel(), - platform::ToNCCLDataType(input.type()), + input.data(), output.data(), input.numel(), + platform::ToNCCLDataType(input.dtype()), ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream)); }, CommType::REDUCE); } std::shared_ptr ProcessGroupNCCL::Scatter( - std::vector& in_tensors, std::vector& out_tensors, - const ScatterOptions& opts) { + std::vector& in_tensors, + std::vector& out_tensors, const ScatterOptions& opts) { PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); @@ -561,31 +541,27 @@ std::shared_ptr ProcessGroupNCCL::Scatter( platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); return Collective( in_tensors, out_tensors, - [&](const Tensor& input, Tensor& output, ncclComm_t comm, + [&](phi::DenseTensor& input, phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - auto input_tensor = - std::dynamic_pointer_cast(input.impl()); - auto output_tensor = - std::dynamic_pointer_cast(output.impl()); size_t offset = 0; if (rank_ == opts.root_rank) { 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto i = 0; i < size_; i++) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( - GetPointerByOffset(input_tensor->data(), offset, input.type()), - input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), i, comm, stream)); - offset += input_tensor->numel() / size_; + GetPointerByOffset(input.data(), offset, input.dtype()), + input.numel() / size_, platform::ToNCCLDataType(input.dtype()), + i, comm, stream)); + offset += input.numel() / size_; } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output_tensor->data(), input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + output.data(), input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), opts.root_rank, comm, stream)); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } else { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output_tensor->data(), input_tensor->numel() / size_, - platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + output.data(), input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), opts.root_rank, comm, stream)); } }, diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index fa73ed195b0c1..cca84285ef4de 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -51,7 +51,7 @@ class ProcessGroupNCCL : public ProcessGroup { public std::enable_shared_from_this { public: NCCLTask(const std::vector& places, int rank, CommType CommType, - const std::vector& inputs); + const std::vector& inputs); bool IsCompleted(); @@ -61,17 +61,17 @@ class ProcessGroupNCCL : public ProcessGroup { void Synchronize(); - void SetOutputs(std::vector& outputs); // NOLINT + void SetOutputs(std::vector& outputs); // NOLINT virtual ~NCCLTask(); std::vector control_events_; - std::vector barrierTensors_; + std::vector barrierTensors_; protected: std::vector places_; std::vector> ncclComms_; - std::shared_ptr> outputs_; + std::shared_ptr> outputs_; private: }; @@ -84,40 +84,46 @@ class ProcessGroupNCCL : public ProcessGroup { } std::shared_ptr AllReduce( - std::vector& tensors, + std::vector& in_tensors, + std::vector& out_tensors, const AllreduceOptions& = AllreduceOptions()) override; std::shared_ptr Broadcast( - std::vector& tensors, + std::vector& in_tensors, + std::vector& out_tensors, const BroadcastOptions& = BroadcastOptions()) override; std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; - std::shared_ptr Send(std::vector& tensors, - int dst_rank) override; + std::shared_ptr Send( + std::vector& tensors, int dst_rank) override; - std::shared_ptr Recv(std::vector& tensors, - int src_rank) override; + std::shared_ptr Recv( + std::vector& tensors, int src_rank) override; std::shared_ptr AllGather( - std::vector& in_tensors, - std::vector& out_tensors) override; + std::vector& in_tensors, + std::vector& out_tensors) override; std::shared_ptr AllToAll( - std::vector& in, std::vector& out) override; + std::vector& in, + std::vector& out) override; std::shared_ptr Reduce( - std::vector& tensors, const ReduceOptions& opts) override; + std::vector& tensors, + std::vector& out_tensors, + const ReduceOptions& opts) override; - std::shared_ptr Scatter(std::vector& in_tensors, - std::vector& out_tensors, - const ScatterOptions&) override; + std::shared_ptr Scatter( + std::vector& in_tensors, + 
std::vector& out_tensors, + const ScatterOptions&) override; protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, - const std::vector& inputs); + const std::vector& inputs); protected: std::shared_ptr store_; @@ -142,8 +148,8 @@ class ProcessGroupNCCL : public ProcessGroup { template std::shared_ptr Collective( - std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT Fn fn, CommType op_type); template @@ -152,7 +158,7 @@ class ProcessGroupNCCL : public ProcessGroup { template std::shared_ptr PointToPoint( - std::vector& tensors, // NOLINT + std::vector& tensors, // NOLINT Fn fn, int dst_rank, CommType op_type); void CreateNCCLManagerCache(const std::string& places_key, diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 02f7f25636410..63e92444b32cb 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -734,7 +734,11 @@ void EagerReducer::ProcessUnusedDenseVars() { distributed::AllreduceOptions opts; opts.reduce_op = ReduceOp::SUM; std::vector reduce_tensors = {global_used_vars_}; - process_group_->AllReduce(reduce_tensors, opts)->Synchronize(); + std::vector in_out; + for (auto &t : reduce_tensors) { + in_out.push_back(*std::dynamic_pointer_cast(t.impl())); + } + process_group_->AllReduce(in_out, in_out, opts)->Synchronize(); framework::TensorToVector(*global_used_tensor, *dev_ctx, &local_used_vars_); @@ -820,7 +824,11 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, // all_reduce std::vector reduce_tensors = {group->dense_contents_}; - group->task = process_group_->AllReduce(reduce_tensors, opts); + std::vector in_out; + for (auto &t : reduce_tensors) { + in_out.push_back(*std::dynamic_pointer_cast(t.impl())); + } + group->task = process_group_->AllReduce(in_out, in_out, opts); // split in FinalizeBackward() } @@ -871,7 +879,11 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, distributed::AllreduceOptions opts; opts.reduce_op = ReduceOp::SUM; std::vector reduce_tensors = {rows_num_tensor}; - process_group_->AllReduce(reduce_tensors, opts)->Synchronize(); + std::vector in_out; + for (auto &t : reduce_tensors) { + in_out.push_back(*std::dynamic_pointer_cast(t.impl())); + } + process_group_->AllReduce(in_out, in_out, opts)->Synchronize(); framework::TensorToVector(*rows_num_dense_tensor, *dev_ctx, &rows_num_vector); @@ -908,8 +920,15 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, std::vector src_rows_tensors = {src_rows_tensor}; std::vector dst_rows_tensors = {dst_rows_tensor}; - process_group_->AllGather(src_rows_tensors, dst_rows_tensors) - ->Synchronize(); + std::vector in; + std::vector out; + for (auto &t : src_rows_tensors) { + in.push_back(*std::dynamic_pointer_cast(t.impl())); + } + for (auto &t : dst_rows_tensors) { + out.push_back(*std::dynamic_pointer_cast(t.impl())); + } + process_group_->AllGather(in, out)->Synchronize(); framework::Vector dst_rows_vector(rows_num, 0); auto *dst_rows_dense_tensor = @@ -934,8 +953,17 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, std::vector src_value_tensors = {src_value_tensor}; std::vector dst_value_tensors = {dst_value_tensor}; - process_group_->AllGather(src_value_tensors, dst_value_tensors) - ->Synchronize(); + std::vector src_dense; + std::vector dst_dense; + for (auto &t : src_value_tensors) { + src_dense.push_back( + *std::dynamic_pointer_cast(t.impl())); + 
} + for (auto &t : dst_value_tensors) { + dst_dense.push_back( + *std::dynamic_pointer_cast(t.impl())); + } + process_group_->AllGather(src_dense, dst_dense)->Synchronize(); src->set_rows(dst_rows_vector); *(src->mutable_value()) = diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index de44a833f6e73..3ed17b67b842a 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2011,8 +2011,7 @@ static std::string GenerateSingleOpBase( "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::" "RecoverTensorWrapper(" "&" - "this->%s, " - "nullptr)) },"; + "this->%s)) },"; ins_contents_str += paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, struct_fwd_input_name); @@ -2058,15 +2057,15 @@ static std::string GenerateSingleOpBase( const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = " if(this->%s.size() > 0) %s[\"%s\"] = " "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::" - "RecoverTensorWrapper(&this->%s, nullptr));\n"; + "RecoverTensorWrapper(&this->%s));\n"; generated_grad_function_body += paddle::string::Sprintf( DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name, ins_name, grad_input_name, struct_fwd_input_name); } else { const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = - " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s, " - "nullptr);\n if(%s.initialized()) %s[\"%s\"] = " - "egr::EagerUtils::TrySyncToVars(%s);\n"; + " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);\n" + " if(%s.initialized()) %s[\"%s\"] = " + " egr::EagerUtils::TrySyncToVars(%s);\n"; generated_grad_function_body += paddle::string::Sprintf( DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, struct_fwd_input_name, grad_input_name, ins_name, grad_input_name, diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index 0081dbb595df3..ea7b4a21a2c54 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -23,7 +23,8 @@ ######################## ops_to_fill_zero_for_empty_grads = set([ "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad", - "sigmoid_triple_grad, add_double_grad" + "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad", + "add_triple_grad" ]) # For API dispatch used at python-level diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index be6dda270093b..d6505ebaa1e68 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -236,7 +236,7 @@ class {} : public egr::GradNodeBase {{ {} // SetAttributes {} - // SetTensorWrappers + // Set TensorWrappers for Forward Inputs {} // SetGradOutMeta & SetEdges {} @@ -245,6 +245,8 @@ class {} : public egr::GradNodeBase {{ {} {} {} +{} + // Set TensorWrappers for Forward Outputs {} }} """ @@ -720,7 +722,8 @@ def GenerateNodeCreationCodes(self): set_attributes_str = "\n".join(set_attributes_list) # SetTensorWrappers - set_tensor_wrappers_list = [] + set_input_tensor_wrappers_list = [] + set_output_tensor_wrappers_list = [] num_fwd_outputs = 
len(forward_outputs_position_map.keys()) for name, (atype, is_fwd_input, pos) in backward_forward_inputs_map.items(): @@ -732,6 +735,7 @@ def GenerateNodeCreationCodes(self): set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), true);" else: set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, {need_input_data});" + set_input_tensor_wrappers_list.append(set_tensor_wrappers) else: if num_fwd_outputs > 1: # Aligned with forward output position @@ -743,8 +747,11 @@ def GenerateNodeCreationCodes(self): set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), false);" else: set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, false);" - set_tensor_wrappers_list.append(set_tensor_wrappers) - set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) + set_output_tensor_wrappers_list.append(set_tensor_wrappers) + set_input_tensor_wrappers_str = "\n".join( + set_input_tensor_wrappers_list) + set_output_tensor_wrappers_str = "\n".join( + set_output_tensor_wrappers_list) # SetGradOutMeta & SetEdges set_grad_out_meta_list = [] @@ -801,9 +808,10 @@ def GenerateNodeCreationCodes(self): self.node_creation_str = FORWARD_BODY_TEMPLATE.format( node_creation_event_str, pass_stop_gradient_args_str, - node_construction_str, set_attributes_str, set_tensor_wrappers_str, - set_grad_out_meta_str, set_edges_str, set_out_rank_str, - set_history_str, set_grad_in_meta_str, set_retain_grad_str) + node_construction_str, set_attributes_str, + set_input_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str, + set_out_rank_str, set_history_str, set_grad_in_meta_str, + set_retain_grad_str, set_output_tensor_wrappers_str) def run(self): # Basic Validation Check @@ -1296,7 +1304,7 @@ def GenerateNodeDefinition(self, grad_node_creation_str): transformed_tensor_name = self.TransformToNextGradName(name) is_optional = (name in self.optional_inputs) - tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, this->shared_from_this());" + tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});" if is_optional: tensor_wrapper_recover_str += "\n" + CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE.format( transformed_tensor_name, transformed_tensor_name, diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 60c5e52767a00..974acb8646ca5 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -731,16 +731,6 @@ std::vector RunBackward( continue; } - auto* next_node = next_node_shared.get(); - if (!node_input_buffers_dict.count(next_node)) { - const auto& input_meta = next_node->InputMeta(); - auto grad_tensor_holder = - std::make_unique(input_meta); - VLOG(6) << "Construct GradTensorHolder for grad node: " - << next_node->name(); - node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); - } - PADDLE_ENFORCE_LT( j, grad_output_tensors[i].size(), paddle::platform::errors::Fatal( @@ -760,8 +750,19 @@ std::vector RunBackward( << ", rank: " << j << " 's name is: " << grad_output_tensor.name(); + auto* next_node = next_node_shared.get(); + if (!node_input_buffers_dict.count(next_node)) { + const auto& input_meta = next_node->InputMeta(); + auto grad_tensor_holder = + std::make_unique(input_meta); + VLOG(6) << "Construct GradTensorHolder for grad 
node: " + << next_node->name(); + node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); + } + VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first << ", rank: " << edge_rank.second; + node_input_buffers_dict[next_node]->add( edge_rank.first, edge_rank.second, grad_output_tensor); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index c483dc0ebd177..6db410fa0f1af 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -59,7 +59,7 @@ class RunCustomOpNode : public GradNodeBase { std::vector* fwd_var) { std::vector res; for (size_t i = 0; i < fwd_var->size(); i++) { - res.emplace_back(fwd_var->at(i).recover(nullptr)); + res.emplace_back(fwd_var->at(i).recover()); } return res; } diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 23c7ea7c5e9b4..6afdd854344eb 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -61,6 +61,10 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { if (!node || !node.get()) { meta->SetGradNode(std::make_shared(meta)); } + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " (addr: " << this << ") " + << " to " << meta->GetMutableGradNode()->name() + << " (addr: " << meta->GetMutableGradNode().get() << ")"; adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); @@ -84,7 +88,9 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { meta->SetGradNode(std::make_shared(meta)); } VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " - << this->name() << " to " << meta->GetMutableGradNode()->name(); + << this->name() << " (addr: " << this << ") " + << " to " << meta->GetMutableGradNode()->name() + << " (addr: " << meta->GetMutableGradNode().get() << ")"; adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 2dacb588ff847..183282d6f87b2 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -110,6 +110,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank, "got tensor: %s is empty please check you network " "and make sure it creates grads.", t.name())); + if (t.is_dense_tensor()) { if (buffer_tensor.is_dense_tensor()) { buffer_tensor = add_final_state_dygraph_function(t, buffer_tensor); diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 3d5d3139de14c..b5dd6b960b23a 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -77,16 +77,17 @@ class TensorWrapper { intermidiate_tensor_.set_name(tensor.name() + "@Saved"); - // If an output is marked "intermedaite", we won't create - // autograd_meta for it. 
- // In that case, simply skip OutRankInfo Copy - if (EagerUtils::nullable_autograd_meta(tensor)) { - out_rank_info_ = EagerUtils::OutRankInfo(tensor); + auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); + if (tensor_autograd_meta) { + auto autograd_meta = std::make_shared( + Edge(nullptr, EagerUtils::OutRankInfo(tensor))); + autograd_meta->SetStopGradient(tensor_autograd_meta->StopGradient()); + intermidiate_tensor_.set_autograd_meta(autograd_meta); + weak_grad_node_ = tensor_autograd_meta->GetMutableGradNode(); } } - paddle::experimental::Tensor recover( - const std::shared_ptr& grad_node) { + paddle::experimental::Tensor recover() { VLOG(6) << "Recover tensor: " << intermidiate_tensor_.name() << " for wrapper"; if (!intermidiate_tensor_.defined()) { @@ -99,9 +100,20 @@ class TensorWrapper { // if it's full_reserved just return the full copy of tensor paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; if (!full_reserved_) { - std::shared_ptr new_grad_node = grad_node; - auto p_ab_autograd_meta = - std::make_shared(Edge(new_grad_node, out_rank_info_)); + std::shared_ptr new_grad_node = weak_grad_node_.lock(); + if (new_grad_node) { + VLOG(3) << "Recovered TensorWrapper with GradNode " + << new_grad_node->name() << " addr: " << new_grad_node.get(); + } else { + VLOG(3) << "Recovered TensorWrapper with Empth GradNode"; + } + auto* intermediate_autograd_meta = + EagerUtils::unsafe_autograd_meta(intermidiate_tensor_); + auto p_ab_autograd_meta = std::make_shared( + Edge(new_grad_node, intermediate_autograd_meta->OutRankInfo())); + p_ab_autograd_meta->SetStopGradient( + intermediate_autograd_meta->StopGradient()); + recovered_tensor.set_autograd_meta( std::static_pointer_cast( p_ab_autograd_meta)); @@ -149,8 +161,8 @@ class TensorWrapper { private: bool full_reserved_ = false; bool no_need_buffer_ = false; - std::pair out_rank_info_; paddle::experimental::Tensor intermidiate_tensor_; + std::weak_ptr weak_grad_node_; uint32_t inplace_version_snapshot_ = 0; }; } // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc index a0c75c0200137..5f563edee39f1 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc @@ -41,7 +41,7 @@ TEST(TensorWrapper, Basic) { et1.set_autograd_meta(auto_grad0); et1.set_name("et1"); auto tw0 = egr::TensorWrapper(et1, true); - auto recover_et1 = tw0.recover(std::make_shared()); + auto recover_et1 = tw0.recover(); CHECK_EQ(recover_et1.name(), std::string("et1")); CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).first, egr::EagerUtils::OutRankInfo(et1).first); @@ -67,7 +67,7 @@ TEST(TensorWrapper, Basic) { auto auto_grad1 = std::make_shared(edge1); et2.set_autograd_meta(auto_grad1); auto tw1 = egr::TensorWrapper(et2, false); - auto recover_et2 = tw1.recover(grad_test_node1); + auto recover_et2 = tw1.recover(); CHECK_EQ(recover_et2.name(), std::string("et2@Saved")); CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).first, egr::EagerUtils::OutRankInfo(et2).first); @@ -76,7 +76,5 @@ TEST(TensorWrapper, Basic) { // Test Raw recover paddle::experimental::Tensor et3; auto tw2 = egr::TensorWrapper(et3, true); - CHECK( - tw2.recover(std::make_shared()).initialized() == - false); + CHECK(tw2.recover().initialized() == false); } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 
bcf4a4627bb76..756563df4dfe7 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -360,16 +360,15 @@ void EagerUtils::Output2Result( } paddle::experimental::Tensor EagerUtils::RecoverTensorWrapper( - TensorWrapper* tw, const std::shared_ptr& grad_node) { - return tw->recover(grad_node); + TensorWrapper* tw) { + return tw->recover(); } std::vector EagerUtils::RecoverTensorWrapper( - std::vector* tw, - const std::shared_ptr& grad_node) { + std::vector* tw) { std::vector ret; for (auto& t : *tw) { - ret.emplace_back(t.recover(grad_node)); + ret.emplace_back(t.recover()); } return ret; } diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index be534d4440561..51a322c8524ac 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -174,11 +174,9 @@ class EagerUtils { const std::shared_ptr& view_output_var); // TensorWrapper Utils - static paddle::experimental::Tensor RecoverTensorWrapper( - TensorWrapper* tw, const std::shared_ptr& grad_node); + static paddle::experimental::Tensor RecoverTensorWrapper(TensorWrapper* tw); static std::vector RecoverTensorWrapper( - std::vector* tw, - const std::shared_ptr& grad_node); + std::vector* tw); // Intermidate needed remove this once we don't need legacy // Inner Method diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d37e4a468cac0..99d3f790e253c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1755,6 +1755,7 @@ USE_TRT_CONVERTER(deformable_conv); USE_TRT_CONVERTER(pool3d) USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) USE_TRT_CONVERTER(preln_skip_layernorm) +USE_TRT_CONVERTER(roll) USE_TRT_CONVERTER(strided_slice) #endif diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index f1800afcb1d26..ec8c1b2fcd75c 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -25,6 +25,7 @@ nv_library(tensorrt_converter preln_emb_eltwise_layernorm.cc strided_slice_op.cc preln_skip_layernorm.cc + roll_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/roll_op.cc b/paddle/fluid/inference/tensorrt/convert/roll_op.cc new file mode 100644 index 0000000000000..407f43d58678e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/roll_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/helper.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { +/* + * Stack converter from fluid to tensorRT. + */ +class RollOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid Roll op to tensorrt Slice layer"; + + framework::OpDesc op_desc(op, nullptr); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + nvinfer1::Dims input_dims = input->getDimensions(); + + std::vector axis = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("axis")); + std::vector shifts = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("shifts")); + + nvinfer1::Dims start; + start.nbDims = input_dims.nbDims; + for (int i = 0; i < start.nbDims; i++) { + start.d[i] = 0; + } + int axis_size = axis.size(); + for (int i = 0; i < axis_size; i++) { + start.d[axis[i]] = (-shifts[i]) % input_dims.d[axis[i]]; + } + + nvinfer1::Dims stride; + stride.nbDims = input_dims.nbDims; + for (int i = 0; i < stride.nbDims; i++) { + stride.d[i] = 1; + } + + nvinfer1::Dims size; + size.nbDims = input_dims.nbDims; + for (int i = 0; i < size.nbDims; i++) { + size.d[i] = 1; + } + + auto output_name = op_desc.Output("Out")[0]; + + auto shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); + + auto* layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride); + layer->setInput(2, *shape_layer->getOutput(0)); +#if IS_TRT_VERSION_GE(7000) + layer->setMode(nvinfer1::SliceMode::kWRAP); +#endif + + RreplenishLayerAndOutput(layer, "roll", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(roll, RollOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index d9a874dd2b629..b44450e7a8212 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -119,6 +119,7 @@ struct SimpleOpTypeSetTeller : public Teller { "slice", "strided_slice", "fused_preln_embedding_eltwise_layernorm", + "roll", "preln_skip_layernorm"}; std::unordered_set teller_set{ "mul", @@ -182,6 +183,7 @@ struct SimpleOpTypeSetTeller : public Teller { "strided_slice", "fused_preln_embedding_eltwise_layernorm", "preln_skip_layernorm", + "roll", "multiclass_nms3"}; }; @@ -928,6 +930,28 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "roll") { +#if !IS_TRT_VERSION_GE(7000) + VLOG(3) << "roll converter does not support trt versions below 7.0"; + return false; +#endif + if (!with_dynamic_shape) { + return false; + } + } + + if (op_type == "strided_slice") { + if (!with_dynamic_shape) { + return false; + } + if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || + !desc.HasAttr("ends") || !desc.HasAttr("strides")) { + VLOG(3) + << "The necessary attributes of the strided_slice operator miss "; + return false; + } + } + if (op_type == "slice") { if (desc.HasAttr("decrease_axis")) { std::vector decrease_axis = diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 89854999c16fc..0d97ffa96dc5c 100644 --- 
a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -18,7 +18,9 @@ limitations under the License. */ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif +#include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/api/include/tensor.h" namespace paddle { namespace operators { @@ -35,6 +37,18 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { int nranks = ctx.Attr("nranks"); int rid = ctx.Attr("ring_id"); + auto map = distributed::ProcessGroupMapFromGid::getInstance(); + if (map->has(rid)) { + // Use ProcessGroup + distributed::ProcessGroup* pg = map->get(rid); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(*in); + out_tensor.push_back(*out); + auto task = pg->AllGather(in_tensor, out_tensor); + task->Wait(); + return; + } auto place = ctx.GetPlace(); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 7bdf5f0c46ca6..4bed282ace8d1 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -41,7 +41,12 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { if (map->has(rid)) { // Use ProcessGroup distributed::ProcessGroup* pg = map->get(rid); - pg->Broadcast(x, out); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(*x); + out_tensor.push_back(*out); + auto task = pg->Broadcast(in_tensor, out_tensor); + task->Wait(); return; } diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 981cd11035129..292f706cb186b 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -16,7 +16,11 @@ limitations under the License. 
*/ #include #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" #define MAX_RANK_SUPPORTED 6 @@ -29,70 +33,6 @@ class ExpandV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandV2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ExpandV2"); - auto x_dims = ctx->GetInputDim("X"); - auto expand_shape = ctx->Attrs().Get>("shape"); - - if (expand_shape.size() == 0) { - expand_shape = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_GE( - expand_shape.size(), static_cast(x_dims.size()), - platform::errors::InvalidArgument( - "The number of elements (%d) of 'shape' for " - "expand_v2 op must be greater than or equal to the rank " - "(%d) of the input.", - expand_shape.size(), static_cast(x_dims.size()))); - PADDLE_ENFORCE_LE(expand_shape.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements (%d) of 'shape' for " - "must not be greater than %d.", - expand_shape.size(), MAX_RANK_SUPPORTED)); - PADDLE_ENFORCE_GE(expand_shape.size(), 1, - platform::errors::InvalidArgument( - "The number of elements (%d) of 'shape' for " - "must be a positive integer.", - expand_shape.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), expand_shape.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - auto diff = expand_shape.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); - for (size_t i = 0; i < expand_shape.size(); ++i) { - if (x_dims[i] == -1) { - out_shape[i] = -1; - } else if (expand_shape[i] == -1) { - if (static_cast(x_dims.size()) > i) { - out_shape[i] = x_dims[i]; - } else { - out_shape[i] = -1; - } - } else if (expand_shape[i] == -2) { - // We use -2 to represent the element in expand_shape is a var. 
- out_shape[i] = -1; - } else { - PADDLE_ENFORCE_GT( - expand_shape[i], 0, - platform::errors::InvalidArgument( - "The %uth element of 'shape' for expand_v2 op must be " - "greater than 0, but the value given is %d.", - i, expand_shape[i])); - out_shape[i] = expand_shape[i]; - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - ctx->ShareLoD("X", "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -291,10 +231,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X"); } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(expand_v2, ExpandInferShapeFunctor, + PD_INFER_META(phi::ExpandInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, ops::ExpandV2GradOpMaker, - ops::ExpandV2GradOpMaker); + ops::ExpandV2GradOpMaker, + ExpandInferShapeFunctor); REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp, ops::ExpandV2DoubleGradOpMaker, ops::ExpandV2DoubleGradOpMaker, diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index f51d776d7195c..a2af131cb505e 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -126,7 +126,7 @@ This operator implements one step of the stochastic gradient descent algorithm. namespace ops = paddle::operators; DECLARE_INFER_SHAPE_FUNCTOR(sgd, SGDInferShapeFunctor, - PD_INFER_META(phi::SGDInferMeta)); + PD_INFER_META(phi::SgdInferMeta)); REGISTER_OPERATOR( sgd, ops::SGDOp, ops::SGDOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index f29546c5210d9..46059100b3802 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -192,13 +192,13 @@ add_subdirectory(profiler) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats) + nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler) nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats) + hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler) hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats) + cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler) cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) endif() diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 38ed1d4f2bb5d..716cd35f0a614 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -115,8 +115,10 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); distributed::AllreduceOptions opts; opts.reduce_op = op; - std::vector tensors = {tensor}; - return self.AllReduce(tensors, opts); + auto dense = + 
std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.AllReduce(tensors, tensors, opts); }, py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM, py::call_guard()) @@ -127,8 +129,10 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); distributed::BroadcastOptions opts; opts.source_rank = source_rank; - std::vector tensors = {tensor}; - return self.Broadcast(tensors, opts); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, tensors, opts); }, py::arg("tensor"), py::arg("source_rank"), py::call_guard()) @@ -146,7 +150,9 @@ void BindDistributed(py::module *m) { [](distributed::ProcessGroup &self, py::handle py_tensor, int dst) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - std::vector tensors = {tensor}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; return self.Send(tensors, dst); }, py::arg("tensor"), py::arg("dst"), @@ -156,7 +162,9 @@ void BindDistributed(py::module *m) { [](distributed::ProcessGroup &self, py::handle py_tensor, int src) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); - std::vector tensors = {tensor}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; return self.Recv(tensors, src); }, py::arg("tensor"), py::arg("src"), @@ -167,8 +175,12 @@ void BindDistributed(py::module *m) { py::handle py_out_tensor) { auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - std::vector in_tensors = {in_tensor}; - std::vector out_tensors = {out_tensor}; + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; return self.AllGather(in_tensors, out_tensors); }, py::arg("in"), py::arg("out"), @@ -179,8 +191,12 @@ void BindDistributed(py::module *m) { py::handle py_out_tensor) { auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - std::vector in_tensors = {in_tensor}; - std::vector out_tensors = {out_tensor}; + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; return self.AllToAll(in_tensors, out_tensors); }, py::arg("in"), py::arg("out"), @@ -193,8 +209,10 @@ void BindDistributed(py::module *m) { distributed::ReduceOptions opts; opts.reduce_op = op; opts.root_rank = dst; - std::vector tensors = {in_tensor}; - return self.Reduce(tensors, opts); + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, tensors, opts); }, py::arg("tensor"), py::arg("dst"), py::arg("op") = distributed::ReduceOp::SUM, @@ -207,8 +225,12 @@ void BindDistributed(py::module *m) { auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); distributed::ScatterOptions opts; opts.root_rank = src; - std::vector in_tensors = {in_tensor}; - std::vector out_tensors = {out_tensor}; + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; return self.Scatter(in_tensors, out_tensors, opts); }, 
py::arg("in"), py::arg("out"), py::arg("src"), diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index d43e327393f25..724b1ba556d4b 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -23,7 +23,7 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor) get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index e10ae8254a79e..9cc5d620280bc 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -164,7 +164,7 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_conte cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform) -cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy) +cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl global_utils) @@ -173,5 +173,3 @@ cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw p cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api) cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api) -cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils) -cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 81e7faeb87015..2b80094a39e31 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/tensor_copy.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -425,8 +424,35 @@ std::vector> conv2d_grad_impl( } Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + kernel_key_set.backend_set = + kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", kernel_key); + + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + auto dense_x = TensorToDenseTensor(x); + Tensor out; - copy(x, place, blocking, &out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(*dense_x, &meta_out); + + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + phi::Place, + bool, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); + return out; } @@ -630,6 +656,176 @@ std::tuple momentum_impl( return api_output; } +std::tuple sgd_impl( + const Tensor& param, + const Tensor& learning_rate, + const Tensor& grad, + paddle::optional master_param, + bool multi_precision) { + DataType kernel_data_type = ParseDataType(param); + auto kernel_key_set = ParseKernelKeyByInputArgs(param, learning_rate, grad); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + VLOG(6) << "sgd API kernel key: [" << kernel_key.backend() << ", " + << kernel_key.layout() << ", " << kernel_data_type << "]"; + + const auto& param_tensor = param.impl(); + std::string kernel_name = "sgd"; + if (phi::DenseTensor::classof(param_tensor.get())) { + if (!phi::DenseTensor::classof(grad.impl().get())) { + kernel_name = "sgd_dense_param_sparse_grad"; + } + } else { + kernel_name = "sgd_sparse_param_sparse_grad"; + } + const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, + {kernel_key.backend(), kernel_key.layout(), kernel_data_type}); + VLOG(6) << kernel_name << " API kernel: " << kernel; + + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + + auto in_learning_rate = + PrepareData(learning_rate, kernel.InputAt(1), {false, true, true, true}); + + std::tuple out; + std::get<0>(out) = param; + if (master_param) { + std::get<1>(out) = *master_param; + } + phi::MetaTensor meta_out_0(std::get<0>(out).impl().get()); + phi::MetaTensor meta_out_1(master_param ? std::get<1>(out).impl().get() + : nullptr); + + if (phi::DenseTensor::classof(param_tensor.get())) { + auto in_param = PrepareData(param, kernel.InputAt(0), {}); + auto in_master_param = PrepareData(master_param, kernel.InputAt(3), {}); + + paddle::optional in_master_param_opt = + master_param + ? paddle::make_optional(*in_master_param) + : paddle::none; + auto master_param_meta = MakeMetaTensor(in_master_param_opt); + paddle::optional master_param_meta_opt = + master_param + ? 
paddle::make_optional(*master_param_meta) + : paddle::none; + + phi::DenseTensor* kernel_out_0 = + SetKernelOutput(kernel_key.backend(), &std::get<0>(out)); + phi::DenseTensor* kernel_out_1 = + master_param + ? static_cast(std::get<1>(out).impl().get()) + : nullptr; + + if (phi::DenseTensor::classof(grad.impl().get())) { + auto in_grad = PrepareData(grad, kernel.InputAt(2), {}); + SgdInferMeta(MakeMetaTensor(*in_param), + MakeMetaTensor(*in_learning_rate), + MakeMetaTensor(*in_grad), + master_param_meta_opt, + multi_precision, + &meta_out_0, + &meta_out_1); + + using kernel_signature = + void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::DenseTensor&, + paddle::optional, + bool, + phi::DenseTensor*, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *in_param, + *in_learning_rate, + *in_grad, + in_master_param_opt, + multi_precision, + kernel_out_0, + kernel_out_1); + } else { + auto in_grad = TensorToSelectedRows(grad); + SgdInferMeta(MakeMetaTensor(*in_param), + MakeMetaTensor(*in_learning_rate), + MakeMetaTensor(*in_grad), + master_param_meta_opt, + multi_precision, + &meta_out_0, + &meta_out_1); + + using kernel_signature = + void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + const phi::DenseTensor&, + const phi::SelectedRows&, + paddle::optional, + bool, + phi::DenseTensor*, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *in_param, + *in_learning_rate, + *in_grad, + in_master_param_opt, + multi_precision, + kernel_out_0, + kernel_out_1); + } + } else { + auto in_param = TensorToSelectedRows(param); + auto in_grad = TensorToSelectedRows(grad); + auto in_master_param = TensorToSelectedRows(master_param); + auto in_master_param_opt = + master_param + ? paddle::make_optional(*in_master_param) + : paddle::none; + auto master_param_meta = MakeMetaTensor(in_master_param_opt); + paddle::optional master_param_meta_opt = + master_param + ? paddle::make_optional(*master_param_meta) + : paddle::none; + + phi::SelectedRows* kernel_out_0 = + SetSelectedRowsKernelOutput(kernel_key.backend(), &std::get<0>(out)); + phi::SelectedRows* kernel_out_1 = + master_param + ? static_cast(std::get<1>(out).impl().get()) + : nullptr; + + SgdInferMeta(MakeMetaTensor(*in_param), + MakeMetaTensor(*in_learning_rate), + MakeMetaTensor(*in_grad), + master_param_meta_opt, + multi_precision, + &meta_out_0, + &meta_out_1); + + using kernel_signature = + void (*)(const platform::DeviceContext&, + const phi::SelectedRows&, + const phi::DenseTensor&, + const phi::SelectedRows&, + paddle::optional, + bool, + phi::SelectedRows*, + phi::SelectedRows*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)(*dev_ctx, + *in_param, + *in_learning_rate, + *in_grad, + in_master_param_opt, + multi_precision, + kernel_out_0, + kernel_out_1); + } + return out; +} + ////////////////// Backward(grad) api impls ////////////////////// // TODO(chenweihang): the original sum grad op can support higher-level diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 5d46ed691816b..4ddc3e5f4e0d2 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include "paddle/phi/api/include/tensor.h" @@ -107,6 +108,13 @@ std::tuple momentum_impl( bool multi_precision, float rescale_grad); +std::tuple sgd_impl( + const Tensor& param, + const Tensor& learning_rate, + const Tensor& grad, + paddle::optional master_param, + bool multi_precision); + ////////////////// Backward(grad) api impls ////////////////////// std::vector add_n_grad_impl(const std::vector& x, diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index f9db152956923..e0c910ba3d66c 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -20,13 +20,13 @@ namespace experimental { /* ------------------ for input ----------------------- */ std::shared_ptr TensorToDenseTensor(const Tensor& tensor) { - return std::dynamic_pointer_cast(tensor.impl()); + return std::static_pointer_cast(tensor.impl()); } std::shared_ptr TensorToDenseTensor( - const paddle::optional& tensor) { + const paddle::optional& tensor) { if (tensor) { - return std::dynamic_pointer_cast(tensor->impl()); + return std::static_pointer_cast(tensor->impl()); } return nullptr; } @@ -45,13 +45,13 @@ std::unique_ptr> TensorToDenseTensor( } std::shared_ptr TensorToSelectedRows(const Tensor& tensor) { - return std::dynamic_pointer_cast(tensor.impl()); + return std::static_pointer_cast(tensor.impl()); } std::shared_ptr TensorToSelectedRows( - const paddle::optional& tensor) { + const paddle::optional& tensor) { if (tensor) { - return std::dynamic_pointer_cast(tensor->impl()); + return std::static_pointer_cast(tensor->impl()); } return nullptr; } @@ -66,6 +66,14 @@ phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { return phi::MetaTensor(tensor); } +paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + std::vector MakeMetaTensor( const std::vector& tensors) { std::vector meta_tensors; @@ -90,6 +98,14 @@ phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } +paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) { return phi::MetaTensor(tensor); } diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 035dfc5204720..47b80bb3fc290 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -42,7 +42,7 @@ std::unique_ptr> TensorToDenseTensor( std::shared_ptr TensorToSelectedRows(const Tensor& tensor); std::shared_ptr TensorToSelectedRows( - const paddle::optional& tensor); + const paddle::optional& tensor); std::shared_ptr TensorToStringTensor(const Tensor& tensor); @@ -50,6 +50,9 @@ std::shared_ptr TensorToStringTensor(const Tensor& tensor); phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor); +paddle::optional MakeMetaTensor( + const paddle::optional& tensor); + std::vector MakeMetaTensor( const std::vector& tensors); @@ -58,6 +61,9 @@ std::vector MakeMetaTensor( phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor); +paddle::optional MakeMetaTensor( + const paddle::optional& tensor); + phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor); /* ------------------ for output ----------------------- */ diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc deleted file mode 100644 index 
981487df86be4..0000000000000 --- a/paddle/phi/api/lib/scalar.cc +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/common/scalar.h" - -#include "paddle/phi/api/lib/tensor_copy.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/enforce.h" - -namespace paddle { -namespace experimental { - -template <> -ScalarBase::ScalarBase(const Tensor& tensor_in) - : dtype_(tensor_in.dtype()) { // NOLINT - PADDLE_ENFORCE_EQ(tensor_in.numel(), - 1, - phi::errors::InvalidArgument( - "The Scalar only supports Tensor with 1 element, but " - "now Tensor has `%d` elements", - tensor_in.numel())); - if (tensor_in.place() == PlaceType::kGPU) { - Tensor dst_tensor; - copy(tensor_in, phi::CPUPlace(), true, &dst_tensor); - GetDataFromTensor(dst_tensor); - } else if (tensor_in.place() == PlaceType::kCPU) { - GetDataFromTensor(tensor_in); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Now, it is not supported to construct Scalar using tensor that its " - "PlaceType is (%d)", - static_cast(tensor_in.place()))); - } -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc deleted file mode 100644 index 57e3c28d8cb1f..0000000000000 --- a/paddle/phi/api/lib/tensor_copy.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/api/lib/tensor_copy.h" -#include "paddle/phi/api/lib/api_gen_utils.h" -#include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/core/meta_tensor.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace experimental { - -void copy(const Tensor& src, Place place, bool blocking, Tensor* dst) { - auto kernel_key_set = ParseKernelKeyByInputArgs(src); - kernel_key_set.backend_set = - kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "copy", kernel_key); - - VLOG(6) << "copy API kernel key: " << kernel_key; - VLOG(6) << "copy API kernel: " << kernel; - - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - - auto dense_x = TensorToDenseTensor(src); - - auto kernel_out = SetKernelOutput(kernel_key.backend(), dst); - phi::MetaTensor meta_out(kernel_out); - phi::UnchangedInferMeta(*dense_x, &meta_out); - - using kernel_signature = void (*)(const platform::DeviceContext&, - const phi::DenseTensor&, - phi::Place, - bool, - phi::DenseTensor*); - - auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); -} - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/tensor_copy.h b/paddle/phi/api/lib/tensor_copy.h deleted file mode 100644 index 3ce45853319ec..0000000000000 --- a/paddle/phi/api/lib/tensor_copy.h +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/api/include/tensor.h" - -namespace paddle { -namespace experimental { - -void copy(const Tensor& src, Place place, bool blocking, Tensor* dst); - -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index de97e7516f619..94a16da2b7720 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor scalar) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar string_tensor) diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index aa839eab587cb..9bf692703860f 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor) +cc_library(scalar SRCS scalar.cc DEPS phi_enforce) diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc index 41f1c9541823d..5cd55c1e88bed 100644 --- a/paddle/phi/common/scalar.cc +++ b/paddle/phi/common/scalar.cc @@ -14,32 +14,21 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/place.h" #include "paddle/phi/core/enforce.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/place.h" namespace paddle { namespace experimental { -// The Tensor must have one dim -template <> -ScalarBase::ScalarBase(const phi::DenseTensor& tensor_in) - : dtype_(tensor_in.dtype()) { // NOLINT - PADDLE_ENFORCE_EQ(tensor_in.numel(), +// NOTE(xiongkun): why we put definition here? +// test_custom_op can't include enforce.h, because enforce.h includes gflags. +// so we decouple the include dependence of enforce.h by link. +void ThrowTensorConvertError(int num) { + PADDLE_ENFORCE_EQ(num, 1, phi::errors::InvalidArgument( "The Scalar only supports Tensor with 1 element, but " "now Tensor has `%d` elements", - tensor_in.numel())); - auto cpu_place = phi::CPUPlace(); - if (!paddle::platform::is_same_place(tensor_in.place(), cpu_place)) { - phi::DenseTensor tensor; - framework::TensorCopySync(tensor_in, cpu_place, &tensor); - GetDataFromTensor(tensor); - } else { - GetDataFromTensor(tensor_in); - } + num)); } } // namespace experimental diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index c28f6185a556a..5134f4eb72639 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -23,6 +23,8 @@ limitations under the License. 
*/ namespace paddle { namespace experimental { +void ThrowTensorConvertError(int); + template class ScalarBase { public: @@ -103,7 +105,50 @@ class ScalarBase { } // The Tensor must have one dim - ScalarBase(const T& tensor_in); // NOLINT + ScalarBase(const T& tensor) : dtype_(tensor.dtype()) { // NOLINT + is_from_tensor_ = true; + ThrowTensorConvertError(tensor.numel()); + switch (dtype_) { + case DataType::FLOAT32: + data_.f32 = tensor.template data()[0]; + break; + case DataType::FLOAT64: + data_.f64 = tensor.template data()[0]; + break; + case DataType::FLOAT16: + data_.f16 = tensor.template data()[0]; + break; + case DataType::BFLOAT16: + data_.bf16 = tensor.template data()[0]; + break; + case DataType::INT32: + data_.i32 = tensor.template data()[0]; + break; + case DataType::INT64: + data_.i64 = tensor.template data()[0]; + break; + case DataType::INT16: + data_.i16 = tensor.template data()[0]; + break; + case DataType::INT8: + data_.i8 = tensor.template data()[0]; + break; + case DataType::UINT8: + data_.ui8 = tensor.template data()[0]; + break; + case DataType::BOOL: + data_.b = tensor.template data()[0]; + break; + case DataType::COMPLEX64: + data_.c64 = tensor.template data()[0]; + break; + case DataType::COMPLEX128: + data_.c128 = tensor.template data()[0]; + break; + default: + PD_THROW("Invalid tensor data type `", dtype_, "`."); + } + } template ScalarBase(const ScalarBase& other) { @@ -155,49 +200,6 @@ class ScalarBase { private: template friend void CopyScalar(const ScalarBase& src, ScalarBase* dst); - void GetDataFromTensor(const T& tensor) { - is_from_tensor_ = true; - switch (dtype_) { - case DataType::FLOAT32: - data_.f32 = tensor.template data()[0]; - break; - case DataType::FLOAT64: - data_.f64 = tensor.template data()[0]; - break; - case DataType::FLOAT16: - data_.f16 = tensor.template data()[0]; - break; - case DataType::BFLOAT16: - data_.bf16 = tensor.template data()[0]; - break; - case DataType::INT32: - data_.i32 = tensor.template data()[0]; - break; - case DataType::INT64: - data_.i64 = tensor.template data()[0]; - break; - case DataType::INT16: - data_.i16 = tensor.template data()[0]; - break; - case DataType::INT8: - data_.i8 = tensor.template data()[0]; - break; - case DataType::UINT8: - data_.ui8 = tensor.template data()[0]; - break; - case DataType::BOOL: - data_.b = tensor.template data()[0]; - break; - case DataType::COMPLEX64: - data_.c64 = tensor.template data()[0]; - break; - case DataType::COMPLEX128: - data_.c128 = tensor.template data()[0]; - break; - default: - PD_THROW("Invalid tensor data type `", dtype_, "`."); - } - } private: bool is_from_tensor_{false}; diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 23574e98fbf17..b42b4388c2ce1 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -23,7 +23,7 @@ cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta te cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc selected_rows.cc DEPS tensor_base dense_tensor phi_enforce ddim memcpy) +cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory) diff --git a/paddle/phi/core/selected_rows.cc b/paddle/phi/core/selected_rows.cc 
deleted file mode 100644 index dcf9c4182157a..0000000000000 --- a/paddle/phi/core/selected_rows.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/selected_rows.h" - -namespace phi { - -SelectedRows::SelectedRows(const std::vector& rows, - const int64_t& height) - : impl_(std::make_shared(rows, height)) {} - -SelectedRows::SelectedRows() - : impl_(std::make_shared()) {} - -} // namespace phi diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index a71c0471cc431..7ee475b4d5d9e 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -42,9 +42,10 @@ class SelectedRows : public TensorBase, * */ public: - SelectedRows(const std::vector& rows, const int64_t& height); + SelectedRows(const std::vector& rows, const int64_t& height) + : impl_(std::make_shared(rows, height)) {} - SelectedRows(); + SelectedRows() : impl_(std::make_shared()) {} const DenseTensor& value() const { return impl_->value(); } diff --git a/paddle/phi/core/utils/type_registry.h b/paddle/phi/core/utils/type_registry.h index f27c3db2275c3..8d9f9167242c8 100644 --- a/paddle/phi/core/utils/type_registry.h +++ b/paddle/phi/core/utils/type_registry.h @@ -51,7 +51,7 @@ TypeInfo TypeRegistry::RegisterType(const std::string& type) { std::lock_guard guard(mutex_); assert(name_to_id_.find(type) == name_to_id_.end()); assert(names_.size() < std::numeric_limits::max()); - int8_t id = static_cast(names_.size()); + int8_t id = names_.size(); names_.emplace_back(type); name_to_id_[type] = id; return TypeInfo(id); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index f2acfe5a9962b..5fecd3740e930 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1887,7 +1887,7 @@ void RnnInferMeta(const MetaTensor& x, } } -void SGDInferMeta(const MetaTensor& param, +void SgdInferMeta(const MetaTensor& param, const MetaTensor& learning_rate, const MetaTensor& grad, paddle::optional master_param, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index c037641d082b7..9137b574ac09d 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -292,7 +292,7 @@ void RnnInferMeta(const MetaTensor& x, std::vector state, MetaTensor* reserve); -void SGDInferMeta(const MetaTensor& param, +void SgdInferMeta(const MetaTensor& param, const MetaTensor& learning_rate, const MetaTensor& grad, paddle::optional master_param, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index a47fc698777f7..7b50a37ac149f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -405,6 +405,78 @@ void EighInferMeta(const MetaTensor& x, out_v->set_dims(input_dim); } +void ExpandInferMeta(const MetaTensor& x, + const IntArray& shape, + MetaTensor* out) { +#define MAX_RANK_SUPPORTED 6 + auto x_dims = x.dims(); + auto expand_shape = shape.GetData(); + + if 
(expand_shape.size() == 0) { + expand_shape = std::vector(x_dims.size(), -1); + } + + PADDLE_ENFORCE_GE( + expand_shape.size(), + static_cast(x_dims.size()), + phi::errors::InvalidArgument( + "The number of elements (%d) of 'shape' for " + "expand_v2 op must be greater than or equal to the rank " + "(%d) of the input.", + expand_shape.size(), + static_cast(x_dims.size()))); + PADDLE_ENFORCE_LE( + expand_shape.size(), + MAX_RANK_SUPPORTED, + phi::errors::InvalidArgument("The number of elements (%d) of 'shape' for " + "must not be greater than %d.", + expand_shape.size(), + MAX_RANK_SUPPORTED)); + PADDLE_ENFORCE_GE( + expand_shape.size(), + 1, + phi::errors::InvalidArgument("The number of elements (%d) of 'shape' for " + "must be a positive integer.", + expand_shape.size())); + + auto out_rank = + std::max(static_cast(x_dims.size()), expand_shape.size()); + std::vector out_shape(out_rank); + auto x_dim_vec = phi::vectorize(x_dims); + auto diff = expand_shape.size() - x_dim_vec.size(); + x_dim_vec.insert(x_dim_vec.begin(), diff, -1); + for (size_t i = 0; i < expand_shape.size(); ++i) { + if (x_dims[i] == -1) { + out_shape[i] = -1; + } else if (expand_shape[i] == -1) { + if (static_cast(x_dims.size()) > i) { + out_shape[i] = x_dims[i]; + } else { + out_shape[i] = -1; + } + } else if (expand_shape[i] == -2) { + // We use -2 to represent the element in expand_shape is a var. + out_shape[i] = -1; + } else { + PADDLE_ENFORCE_GT( + expand_shape[i], + 0, + phi::errors::InvalidArgument( + "The %uth element of 'shape' for expand_v2 op must be " + "greater than 0, but the value given is %d.", + i, + expand_shape[i])); + out_shape[i] = expand_shape[i]; + } + } + + out->set_dims(make_ddim(out_shape)); + out->set_dtype(x.dtype()); + if (out_shape[0] == x_dims[0]) { + out->share_lod(x); + } +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index c49e4c88dd899..ac5040388b334 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -85,6 +85,10 @@ void EighInferMeta(const MetaTensor& x, MetaTensor* out_w, MetaTensor* out_v); +void ExpandInferMeta(const MetaTensor& x, + const IntArray& shape, + MetaTensor* out); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h b/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h index 4e82cccac3422..17b6d7516e070 100644 --- a/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h @@ -20,7 +20,9 @@ namespace phi { template -void PoissonGradKernel(const Context& ctx, DenseTensor* x_grad) { +void PoissonGradKernel(const Context& ctx, + const DenseTensor& out_grad, + DenseTensor* x_grad) { ctx.template Alloc(x_grad); phi::funcs::SetConstant functor; functor(ctx, x_grad, static_cast(0)); diff --git a/paddle/phi/kernels/poisson_grad_kernel.h b/paddle/phi/kernels/poisson_grad_kernel.h index 21720474f4a12..3ef60d7a51676 100644 --- a/paddle/phi/kernels/poisson_grad_kernel.h +++ b/paddle/phi/kernels/poisson_grad_kernel.h @@ -20,6 +20,8 @@ namespace phi { template -void PoissonGradKernel(const Context& ctx, DenseTensor* x_grad); +void PoissonGradKernel(const Context& ctx, + const DenseTensor& out_grad, + DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/ops/compat/poisson_sig.cc b/paddle/phi/ops/compat/poisson_sig.cc index cb6ae28804669..e45640c11b6ee 100644 --- a/paddle/phi/ops/compat/poisson_sig.cc +++ 
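The ExpandInferMeta added above encodes the expand_v2 broadcasting rule: 'shape' must have at least rank(x) and at most 6 entries, -1 keeps the corresponding input dimension, and -2 marks a dimension supplied by a Variable at runtime. A simplified Python sketch of that output-shape computation (hypothetical helper; the unknown-input-dimension handling of the real code is omitted):

def infer_expand_shape(x_shape, expand_shape, max_rank=6):
    if len(expand_shape) == 0:
        expand_shape = [-1] * len(x_shape)
    assert len(x_shape) <= len(expand_shape) <= max_rank
    # Left-pad x's shape so both lists have the same rank.
    padded = [-1] * (len(expand_shape) - len(x_shape)) + list(x_shape)
    out = []
    for x_d, e_d in zip(padded, expand_shape):
        if e_d == -1:
            out.append(x_d)      # keep (or leave unknown) the input dim
        elif e_d == -2:
            out.append(-1)       # dim comes from a Variable, unknown here
        else:
            assert e_d > 0, "target dims must be positive"
            out.append(e_d)
    return out

infer_expand_shape([3], [2, 3])        # -> [2, 3]
infer_expand_shape([1, 3], [-1, -1])   # -> [1, 3]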
b/paddle/phi/ops/compat/poisson_sig.cc @@ -18,7 +18,8 @@ namespace phi { KernelSignature PoissonGradOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("poisson_grad", {}, {}, {GradVarName("X")}); + return KernelSignature( + "poisson_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); } } // namespace phi diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index dd4b7e62ec52f..94378aceff58c 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -11,14 +11,14 @@ cc_test(test_mean_api SRCS test_mean_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_dot_api SRCS test_dot_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) +cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_slice_api SRCS test_slice_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_sum_api SRCS test_sum_api.cc DEPS ${COMMON_API_TEST_DEPS}) -cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar) +cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_conj_api SRCS test_conj_api.cc DEPS ${COMMON_API_TEST_DEPS}) cc_test(test_concat_api SRCS test_concat_api.cc DEPS ${COMMON_API_TEST_DEPS}) diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt index ca6d20045d171..710ea3c066472 100644 --- a/paddle/phi/tests/common/CMakeLists.txt +++ b/paddle/phi/tests/common/CMakeLists.txt @@ -2,9 +2,3 @@ cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest) cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest) cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest) cc_test(phi_test_place SRCS test_place.cc DEPS phi_place) -if (WITH_GPU) - nv_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) -endif() -if(WITH_ROCM) - hip_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar) -endif() diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu deleted file mode 100644 index 6b0caa175dc04..0000000000000 --- a/paddle/phi/tests/common/test_scalar.cu +++ /dev/null @@ -1,205 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
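With the kernel signature and argument mapping above, poisson_grad now takes Out@GRAD as its input and simply fills X@GRAD with zeros (sampling from a Poisson distribution is treated as non-differentiable with respect to the rate). A quick dygraph check of that behaviour, mirroring the updated unit test further below:

import numpy as np
import paddle

x = paddle.randn([10, 10], dtype='float32')
x.stop_gradient = False
y = paddle.poisson(x)
y.backward()
# The backward kernel writes a constant zero into x's gradient.
assert np.array_equal(x.gradient(), np.zeros_like(x.numpy()))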
*/ - -#include // NOLINT -#include "gtest/gtest.h" -#include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/api/lib/utils/allocator.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" - -PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); - -namespace phi { -namespace tests { - -using DDim = phi::DDim; -using float16 = phi::dtype::float16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -__global__ void FillTensor(float* data) { data[0] = 1; } - -TEST(Scalar, ConstructFromDenseTensor1) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - ASSERT_NEAR(1, scalar_test.to(), 1e-6); -} - -TEST(Scalar, ConstructFromDenseTensor2) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - ASSERT_EQ(1, scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor3) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - ASSERT_EQ(1, scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor4) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = true; - phi::Scalar scalar_test(dense_x); - ASSERT_EQ(true, scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor5) { - // 1. 
create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x(alloc.get(), - phi::DenseTensorMeta(phi::DataType::COMPLEX64, - phi::make_ddim({1}), - phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - complex64 expected_value(1, 0); - EXPECT_TRUE(expected_value == scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor6) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::CPUPlace()); - phi::DenseTensor dense_x(alloc.get(), - phi::DenseTensorMeta(phi::DataType::COMPLEX128, - phi::make_ddim({1}), - phi::DataLayout::NCHW)); - phi::CPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::CPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - dense_x_data[0] = 1; - phi::Scalar scalar_test(dense_x); - complex128 expected_value(1, 0); - EXPECT_TRUE(expected_value == scalar_test.to()); -} - -TEST(Scalar, ConstructFromDenseTensor7) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::GPUPlace()); - phi::DenseTensor dense_x( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); - phi::GPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::GPUPlace()) - .get()); - dev_ctx.Init(); - - auto* dense_x_data = dev_ctx.Alloc(&dense_x); - FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); - dev_ctx.Wait(); - phi::Scalar scalar_test(dense_x); - ASSERT_NEAR(1, scalar_test.to(), 1e-6); -} - -TEST(Scalar, ConstructFromTensor) { - // 1. create tensor - const auto alloc = - std::make_unique(phi::GPUPlace()); - auto dense_x = std::make_shared( - alloc.get(), - phi::DenseTensorMeta( - phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW)); - - phi::GPUContext dev_ctx; - dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() - .GetAllocator(phi::GPUPlace()) - .get()); - dev_ctx.Init(); - auto* dense_x_data = dev_ctx.Alloc(dense_x.get()); - FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); - dev_ctx.Wait(); - paddle::experimental::Tensor x(dense_x); - paddle::experimental::Scalar scalar_test(x); - ASSERT_NEAR(1, scalar_test.to(), 1e-6); -} - -} // namespace tests -} // namespace phi diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 7d2fd90e6bb7b..824d188457815 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel scalar) +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e8bde467e085d..2e2efa65d7007 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -923,12 +923,11 @@ function fetch_upstream_develop_if_not_exist() { } function check_whl_size() { - if [ ! 
"${pr_whl_size}" ];then - echo "pr whl size not found " - exit 1 - fi set +x + pr_whl_size=`du -m ${PADDLE_ROOT}/build/pr_whl/*.whl|awk '{print $1}'` + echo "pr_whl_size: ${pr_whl_size}" + dev_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` echo "dev_whl_size: ${dev_whl_size}" @@ -949,11 +948,20 @@ function check_whl_size() { } function generate_upstream_develop_api_spec() { - fetch_upstream_develop_if_not_exist - cur_branch=`git branch | grep \* | cut -d ' ' -f2` + cp ${PADDLE_ROOT}/python/requirements.txt /tmp + pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` + mkdir -p ${PADDLE_ROOT}/build/pr_whl && mv ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl/ + echo "pr_whl_size: ${pr_whl_size}" + + rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt + cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` + if [[ ${cmake_change} ]];then + rm -rf ${PADDLE_ROOT}/build/third_party + fi + + cd ${PADDLE_ROOT} git checkout . git checkout -b develop_base_pr upstream/$BRANCH - startTime_firstBuild=`date +%s` dev_commit=`git log -1|head -1|awk '{print $2}'` dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl" @@ -964,21 +972,11 @@ function generate_upstream_develop_api_spec() { cmake_gen $1 build $2 fi - - cp ${PADDLE_ROOT}/python/requirements.txt /tmp - pr_whl_size=`du -m ${PADDLE_ROOT}/build/python/dist/*.whl|awk '{print $1}'` - echo "pr_whl_size: ${pr_whl_size}" - - - git checkout $cur_branch generate_api_spec "$1" "DEV" - git branch -D develop_base_pr - ENABLE_MAKE_CLEAN="ON" - rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt - cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` - if [[ ${cmake_change} ]];then - rm -rf ${PADDLE_ROOT}/build/third_party - fi + + endTime_s=`date +%s` + echo "Build Time: $[ $endTime_s - $startTime_s ]s" + echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt } function generate_api_spec() { @@ -2997,15 +2995,13 @@ function main() { example_code=$? summary_check_problems $check_style_code $[${example_code_gpu} + ${example_code}] "$check_style_info" "${example_info_gpu}\n${example_info}" assert_api_spec_approvals - check_whl_size ;; build_and_check_cpu) set +e - generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} - check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" - check_whl_size + generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} + check_sequence_op_unittest ;; build_and_check_gpu) set +e @@ -3022,6 +3018,9 @@ function main() { summary_check_problems $check_style_code $[${example_code_gpu} + ${example_code}] "$check_style_info" "${example_info_gpu}\n${example_info}" assert_api_spec_approvals ;; + check_whl_size) + check_whl_size + ;; build) cmake_gen ${PYTHON_ABI:-""} build ${parallel_number} diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 8242d8e3392ec..95db9d39c1ec4 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -44,6 +44,7 @@ from .. 
import compat as cpt import warnings from paddle import _C_ops +from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad', @@ -1370,7 +1371,11 @@ def _append_optimize_op(self, block, param_and_grad): if find_master else None) lr = self._create_param_lr(param_and_grad) - if framework._non_static_mode(): + if in_dygraph_mode(): + _C_ops.final_state_sgd(param_and_grad[0], lr, param_and_grad[1], + master_weight, find_master) + return None + if _in_legacy_dygraph(): _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight, param_and_grad[0], master_weight) return None diff --git a/python/paddle/fluid/tests/unittests/init_process_group.py b/python/paddle/fluid/tests/unittests/init_process_group.py index c9c957572c515..17887a9d767c1 100644 --- a/python/paddle/fluid/tests/unittests/init_process_group.py +++ b/python/paddle/fluid/tests/unittests/init_process_group.py @@ -46,6 +46,11 @@ def test_init_process_group(self): group = paddle.distributed.collective.Group(-1, 2, 0, [-1, -2]) ret = paddle.distributed.barrier(group) assert ret == None + paddle.enable_static() + in_tensor = paddle.empty((1, 2)) + in_tensor2 = paddle.empty((1, 2)) + paddle.distributed.broadcast(in_tensor, src=0) + paddle.distributed.all_gather([in_tensor, in_tensor2], in_tensor) print("test ok\n") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py index bc40d3b4c27d9..c7f724bdaae3f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py @@ -28,16 +28,16 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): if dims == 1: - return np.ones([64]).astype(np.float32) + return np.ones([32]).astype(np.float32) elif dims == 2: - return np.ones([3, 64]).astype(np.float32) + return np.ones([3, 32]).astype(np.float32) elif dims == 3: - return np.ones([3, 64, 64]).astype(np.float32) + return np.ones([3, 32, 32]).astype(np.float32) else: - return np.ones([batch, 3, 64, 64]).astype(np.float32) + return np.ones([batch, 3, 32, 32]).astype(np.float32) for dims in [1, 2, 3, 4]: - for batch in [1, 2, 4]: + for batch in [1, 4]: for op_type in ["relu", "sigmoid", "tanh", "relu6"]: self.dims = dims dics = [{}] @@ -70,27 +70,25 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} - self.dynamic_shape.max_input_shape = {"input_data": [128]} - self.dynamic_shape.opt_input_shape = {"input_data": [64]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} - self.dynamic_shape.max_input_shape = { - "input_data": [10, 64, 64] - } - self.dynamic_shape.opt_input_shape = 
{"input_data": [3, 64, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} else: self.dynamic_shape.min_input_shape = { - "input_data": [1, 3, 32, 32] + "input_data": [1, 3, 16, 16] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 3, 64, 64] + "input_data": [4, 3, 32, 32] } self.dynamic_shape.opt_input_shape = { - "input_data": [1, 3, 64, 64] + "input_data": [1, 3, 32, 32] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py index 410cef798aa63..899cf0e263955 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_batch_norm.py @@ -54,7 +54,7 @@ def generate_MomentumTensor(attrs: List[Dict[str, Any]], batch): for dims in [2, 3, 4]: for num_input in [0, 1]: - for batch in [1, 2, 4]: + for batch in [1, 4]: for epsilon in [1e-6, 1e-5, 1e-4]: for data_layout in ["NCHW"]: for momentum in [0.9, 0.8]: @@ -134,33 +134,33 @@ def generate_dynamic_shape(attrs): if self.dims == 4: if attrs[0]['data_layout'] == "NCHW": self.dynamic_shape.min_input_shape = { - "batch_norm_input": [1, 3, 24, 24] + "batch_norm_input": [1, 3, 12, 12] } self.dynamic_shape.max_input_shape = { - "batch_norm_input": [4, 3, 48, 48] + "batch_norm_input": [4, 3, 24, 24] } self.dynamic_shape.opt_input_shape = { - "batch_norm_input": [1, 3, 24, 48] + "batch_norm_input": [1, 3, 24, 24] } elif attrs[0]['data_layout'] == "NHWC": self.dynamic_shape.min_input_shape = { - "batch_norm_input": [1, 24, 24, 3] + "batch_norm_input": [1, 12, 12, 3] } self.dynamic_shape.max_input_shape = { - "batch_norm_input": [4, 48, 48, 3] + "batch_norm_input": [4, 24, 24, 3] } self.dynamic_shape.opt_input_shape = { - "batch_norm_input": [1, 24, 48, 3] + "batch_norm_input": [1, 24, 24, 3] } elif self.dims == 3: self.dynamic_shape.min_input_shape = { - "batch_norm_input": [1, 3, 24] + "batch_norm_input": [1, 3, 12] } self.dynamic_shape.max_input_shape = { - "batch_norm_input": [4, 3, 48] + "batch_norm_input": [4, 3, 24] } self.dynamic_shape.opt_input_shape = { - "batch_norm_input": [1, 3, 48] + "batch_norm_input": [1, 3, 24] } elif self.dims == 2: self.dynamic_shape.min_input_shape = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py index 5150622cf801d..1277cde011c17 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py @@ -28,13 +28,13 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): if dims == 1: - return np.ones([64]).astype(np.float32) + return np.ones([32]).astype(np.float32) elif dims == 2: - return np.ones([3, 64]).astype(np.float32) + return np.ones([3, 32]).astype(np.float32) elif dims == 3: - return np.ones([3, 64, 64]).astype(np.float32) + return np.ones([3, 32, 32]).astype(np.float32) else: - return np.ones([batch, 3, 64, 64]).astype(np.float32) + return np.ones([batch, 3, 32, 32]).astype(np.float32) def generate_weight1(attrs: List[Dict[str, Any]]): return np.array([np.random.uniform(1, 
10)]).astype("float32") @@ -43,7 +43,7 @@ def generate_weight2(attrs: List[Dict[str, Any]]): return np.array([np.random.uniform(10, 20)]).astype("float32") for dims in [1, 2, 3, 4]: - for batch in [1, 2, 4]: + for batch in [1, 4]: for op_inputs in [{ "X": ["input_data"] }, { @@ -89,27 +89,25 @@ def sample_predictor_configs(self, program_config): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} - self.dynamic_shape.max_input_shape = {"input_data": [128]} - self.dynamic_shape.opt_input_shape = {"input_data": [64]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} - self.dynamic_shape.max_input_shape = { - "input_data": [10, 64, 64] - } - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} else: self.dynamic_shape.min_input_shape = { - "input_data": [1, 3, 32, 32] + "input_data": [1, 3, 16, 16] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 3, 64, 64] + "input_data": [4, 3, 32, 32] } self.dynamic_shape.opt_input_shape = { - "input_data": [1, 3, 64, 64] + "input_data": [1, 3, 32, 32] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py index 5f85debf4b27c..84ef5b4da68ab 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d.py @@ -46,20 +46,16 @@ def sample_program_configs(self): self.trt_param.workspace_size = 1073741824 def generate_input1(batch, attrs: List[Dict[str, Any]]): - if attrs[0]['groups'] == 1: - return np.ones([batch, 3, 64, 64]).astype(np.float32) - elif attrs[0]['groups'] == 2: - return np.ones([batch, 6, 64, 64]).astype(np.float32) - else: - return np.ones([batch, 9, 64, 64]).astype(np.float32) + return np.ones( + [batch, attrs[0]['groups'] * 3, 64, 64]).astype(np.float32) def generate_weight1(attrs: List[Dict[str, Any]]): return np.random.random([24, 3, 3, 3]).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for strides in [[1, 1], [2, 2], [1, 2]]: for paddings in [[0, 3], [1, 2, 3, 4]]: - for groups in [1, 2, 3]: + for groups in [1, 3]: for padding_algorithm in ['EXPLICIT', 'SAME', 'VALID']: for dilations in [[1, 1], [2, 2], [1, 2]]: for data_format in ['NCHW']: @@ -116,45 +112,19 @@ def generate_weight1(attrs: List[Dict[str, Any]]): def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - if attrs[0]['groups'] == 1: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 3, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 3, 64, 64], - 
"output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 3, 64, 64], - "output_data": [1, 24, 64, 64] - } - elif attrs[0]['groups'] == 2: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 6, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 6, 64, 64], - "output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 6, 64, 64], - "output_data": [1, 24, 64, 64] - } - else: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 9, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 9, 64, 64], - "output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 9, 64, 64], - "output_data": [1, 24, 64, 64] - } + input_groups = attrs[0]['groups'] * 3 + self.dynamic_shape.min_input_shape = { + "input_data": [1, input_groups, 32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, input_groups, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, input_groups, 64, 64], + "output_data": [1, 24, 64, 64] + } def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py index b1b5626c10eb9..8a9a9909571a4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_fusion.py @@ -49,10 +49,8 @@ def sample_program_configs(self): self.trt_param.workspace_size = 1073741824 def generate_input1(batch, attrs: List[Dict[str, Any]]): - if attrs[0]['groups'] == 2: - return np.ones([batch, 6, 64, 64]).astype(np.float32) - else: - return np.ones([batch, 9, 64, 64]).astype(np.float32) + return np.ones( + [batch, attrs[0]['groups'] * 3, 64, 64]).astype(np.float32) def generate_weight1(attrs: List[Dict[str, Any]]): return np.random.random([24, 3, 3, 3]).astype(np.float32) @@ -60,7 +58,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]): def generate_weight2(attrs: List[Dict[str, Any]]): return np.random.random([24, 1, 1]).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for strides in [[1, 1], [2, 2], [1, 2]]: for paddings in [[0, 3], [1, 2, 3, 4]]: for groups in [2, 3]: @@ -126,32 +124,19 @@ def generate_weight2(attrs: List[Dict[str, Any]]): def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - if attrs[0]['groups'] == 2: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 6, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 6, 64, 64], - "output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 6, 64, 64], - "output_data": [1, 24, 64, 64] - } - else: - self.dynamic_shape.min_input_shape = { - "input_data": [1, 9, 32, 32], - "output_data": [1, 24, 32, 32] - } - self.dynamic_shape.max_input_shape = { - "input_data": [4, 9, 64, 64], - "output_data": [4, 24, 64, 64] - } - self.dynamic_shape.opt_input_shape = { - "input_data": [1, 9, 64, 64], - "output_data": [1, 24, 64, 64] - } + input_groups = attrs[0]['groups'] * 3 + self.dynamic_shape.min_input_shape = { + "input_data": [1, input_groups, 
32, 32], + "output_data": [1, 24, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, input_groups, 64, 64], + "output_data": [4, 24, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, input_groups, 64, 64], + "output_data": [1, 24, 64, 64] + } def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index e849496621a10..ec02a357a48b6 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -32,7 +32,7 @@ def generate_input(shape): def generate_weight(): return np.random.randn(32).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for shape in [[32], [batch, 32], [batch, 32, 32], [batch, 32, 16, 32]]: for op_type in ["elementwise_add", "elementwise_mul"]: @@ -72,7 +72,7 @@ def generate_dynamic_shape(attrs): # The input.dims[1] must be equal to the weight's length. if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [4]} - self.dynamic_shape.max_input_shape = {"input_data": [256]} + self.dynamic_shape.max_input_shape = {"input_data": [32]} self.dynamic_shape.opt_input_shape = {"input_data": [16]} elif self.dims == 2: self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} @@ -80,19 +80,17 @@ def generate_dynamic_shape(attrs): self.dynamic_shape.opt_input_shape = {"input_data": [2, 32]} elif self.dims == 3: self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 4]} - self.dynamic_shape.max_input_shape = { - "input_data": [4, 32, 256] - } - self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 32]} elif self.dims == 4: self.dynamic_shape.min_input_shape = { "input_data": [1, 32, 4, 4] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 32, 128, 256] + "input_data": [4, 32, 32, 32] } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 32, 32, 16] + "input_data": [4, 32, 16, 32] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py index e79b33d88d3f1..448e4e3e71b02 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py @@ -28,13 +28,13 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(dims, attrs: List[Dict[str, Any]]): if dims == 1: - return np.ones([64]).astype(np.float32) + return np.ones([32]).astype(np.float32) elif dims == 2: - return np.ones([3, 64]).astype(np.float32) + return np.ones([3, 32]).astype(np.float32) elif dims == 3: - return np.ones([3, 64, 64]).astype(np.float32) + return np.ones([3, 32, 32]).astype(np.float32) else: - return np.ones([1, 3, 64, 64]).astype(np.float32) + return np.ones([1, 3, 32, 32]).astype(np.float32) for dims in [1, 2, 3, 4]: for approximate in [True, False]: @@ -69,27 +69,25 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} - self.dynamic_shape.max_input_shape = {"input_data": [128]} - 
self.dynamic_shape.opt_input_shape = {"input_data": [64]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 32]} - self.dynamic_shape.max_input_shape = { - "input_data": [10, 64, 64] - } - self.dynamic_shape.opt_input_shape = {"input_data": [3, 64, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} else: self.dynamic_shape.min_input_shape = { - "input_data": [1, 3, 32, 32] + "input_data": [1, 3, 16, 16] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 3, 64, 64] + "input_data": [4, 3, 32, 32] } self.dynamic_shape.opt_input_shape = { - "input_data": [1, 3, 64, 64] + "input_data": [1, 3, 32, 32] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py index 969f0e8b148a2..b3f118e9fbf52 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py @@ -29,8 +29,8 @@ def sample_program_configs(self): def generate_input(shape): return np.random.random(shape).astype(np.float32) - for batch in [1, 2, 4]: - for shape in [[batch, 64], [batch, 32, 64], [batch, 64, 32, 128]]: + for batch in [1, 4]: + for shape in [[batch, 32], [batch, 16, 32], [batch, 32, 16, 128]]: self.input_dim = len(shape) for slope in [0.1, 0.5]: for offset in [0.2, 0.7]: @@ -63,23 +63,21 @@ def sample_predictor_configs( def generate_dynamic_shape(attrs): if self.input_dim == 2: self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} - self.dynamic_shape.max_input_shape = {"input_data": [64, 128]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} elif self.input_dim == 3: self.dynamic_shape.min_input_shape = {"input_data": [1, 8, 8]} - self.dynamic_shape.max_input_shape = { - "input_data": [64, 128, 256] - } - self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 64]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 16, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [4, 16, 32]} elif self.input_dim == 4: self.dynamic_shape.min_input_shape = { "input_data": [1, 8, 8, 4] } self.dynamic_shape.max_input_shape = { - "input_data": [64, 128, 256, 512] + "input_data": [4, 32, 16, 128] } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 16, 64, 128] + "input_data": [4, 32, 16, 128] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py index 283a19ec00574..c092d6da86839 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py @@ -37,7 +37,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]]): - return np.ones([1, 3, 64, 64]).astype(np.float32) + return np.ones([1, 3, 32, 32]).astype(np.float32) for threshold in [6.0, 7.0, 100.0, 0.0, -1.0]: for scale in [5.0, 6.0, 7.0, -1.0, 0.0, 100.0]: @@ -74,9 +74,9 @@ def generate_input1(attrs: List[Dict[str, Any]]): def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} - self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 64, 64]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [2, 3, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [1, 3, 32, 32]} def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py index 10109cdc73a2b..00e3f7feb6022 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py @@ -136,7 +136,7 @@ def generate_dynamic_shape(attrs): "input_data": [1, 1], } self.dynamic_shape.max_input_shape = { - "input_data": [4, 64], + "input_data": [4, 32], } self.dynamic_shape.opt_input_shape = { "input_data": [2, 3], @@ -146,7 +146,7 @@ def generate_dynamic_shape(attrs): "input_data": [1, 1, 1, 1], } self.dynamic_shape.max_input_shape = { - "input_data": [4, 64, 128, 128], + "input_data": [4, 3, 16, 32], } self.dynamic_shape.opt_input_shape = { "input_data": [2, 3, 16, 32], @@ -156,10 +156,10 @@ def generate_dynamic_shape(attrs): "input_data": [1, 1, 1], } self.dynamic_shape.max_input_shape = { - "input_data": [4, 64, 256], + "input_data": [4, 3, 32], } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 3, 128], + "input_data": [2, 3, 16], } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py new file mode 100644 index 0000000000000..1b3d38036614f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertRollTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + return np.ones([1, 56, 56, 192]).astype(np.float32) + + for axis in [[1, 2]]: + for shifts in [[-1, -1], [-3, -3]]: + dics = [{ + "axis": axis, + "shifts": shifts, + }] + + ops_config = [{ + "op_type": "roll", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["roll_output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig(data_gen=partial(generate_input1, dics)) + }, + outputs=["roll_output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 56, 56, 192] + } + self.dynamic_shape.max_input_shape = { + "input_data": [8, 56, 56, 192] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 56, 56, 192] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + inputs = program_config.inputs + + if not dynamic_shape: + return 0, 3 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-4 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-4 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py index 62e7a10327747..d607a43739eb7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scale.py @@ -94,14 +94,14 @@ def generate_dynamic_shape(attrs): "scale_input": [1, 3, 24, 24] } self.dynamic_shape.max_input_shape = { - "scale_input": [9, 3, 48, 48] + "scale_input": [4, 3, 24, 24] } self.dynamic_shape.opt_input_shape = { - "scale_input": [1, 3, 48, 
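The new roll converter test above feeds a [1, 56, 56, 192] tensor with negative shifts on axes 1 and 2; the TensorRT path is expected to reproduce paddle.roll, which follows the same semantics as numpy's np.roll. A small reference check in plain dygraph (hedged, illustrative sizes):

import numpy as np
import paddle

data = np.arange(2 * 4 * 4).reshape([2, 4, 4]).astype('float32')
x = paddle.to_tensor(data)
out = paddle.roll(x, shifts=[-1, -3], axis=[1, 2])
expected = np.roll(data, shift=[-1, -3], axis=[1, 2])
assert np.array_equal(out.numpy(), expected)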
24] + "scale_input": [1, 3, 24, 24] } elif self.dims == 3: self.dynamic_shape.min_input_shape = {"scale_input": [1, 3, 24]} - self.dynamic_shape.max_input_shape = {"scale_input": [9, 6, 48]} + self.dynamic_shape.max_input_shape = {"scale_input": [4, 3, 24]} self.dynamic_shape.opt_input_shape = {"scale_input": [1, 3, 24]} elif self.dims == 2: self.dynamic_shape.min_input_shape = {"scale_input": [1, 24]} diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py index 93ba5da9d66d9..062312b0fab4f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py @@ -69,7 +69,7 @@ def generate_input3(attrs: List[Dict[str, Any]], batch): return np.ones([24]).astype(np.float32) for dims in [1, 2, 3, 4]: - for batch in [1, 2, 4]: + for batch in [1, 4]: for axis in [-2, -1, 0, 1, 2, 3]: self.dims = dims dics = [{"axis": axis}, {}] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py index 17955c6e007d9..269523661ee4d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py @@ -37,7 +37,7 @@ def generate_input1(attrs: List[Dict[str, Any]], batch, channel): def generate_input2(attrs: List[Dict[str, Any]], batch): return np.random.random([batch, 2]).astype(np.int32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for class_num in [80, 30]: for anchors in [[10, 13, 16, 30, 33, 23]]: for downsample_ratio in [32, 16]: @@ -97,24 +97,24 @@ def generate_dynamic_shape(attrs): if attrs[0]['iou_aware'] == True: channel = 3 * (attrs[0]['class_num'] + 6) self.dynamic_shape.min_input_shape = { - "scale_input": [1, channel, 24, 24] + "scale_input": [1, channel, 12, 12] } self.dynamic_shape.max_input_shape = { - "scale_input": [4, channel, 48, 48] + "scale_input": [4, channel, 24, 24] } self.dynamic_shape.opt_input_shape = { - "scale_input": [1, channel, 24, 48] + "scale_input": [1, channel, 24, 24] } else: channel = 3 * (attrs[0]['class_num'] + 5) self.dynamic_shape.min_input_shape = { - "scale_input": [1, channel, 24, 24] + "scale_input": [1, channel, 12, 12] } self.dynamic_shape.max_input_shape = { - "scale_input": [4, channel, 48, 48] + "scale_input": [4, channel, 24, 24] } self.dynamic_shape.opt_input_shape = { - "scale_input": [1, channel, 24, 48] + "scale_input": [1, channel, 24, 24] } def clear_dynamic_shape(): diff --git a/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py b/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py index 2da5b770d052c..29e3436948e98 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_reduce_op.py @@ -28,12 +28,13 @@ def setUp(self): x = np.random.random(size=(10, 7)).astype(typename) y = np.random.random(size=(10, 7)).astype(typename) z = callback(x, y) + self.python_api = paddle.tensor.equal_all self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': z} self.op_type = op_type def test_output(self): - self.check_output() + self.check_output(check_eager=True) cls_name = "{0}_{1}_{2}".format(op_type, typename, 'not_equal_all') Cls.__name__ = cls_name @@ -46,12 +47,13 @@ def setUp(self): x = np.random.random(size=(10, 7)).astype(typename) y = 
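The yolo_box shapes above follow directly from the op's channel layout: each of the 3 anchors per location carries x, y, w, h, an objectness score, the per-class scores, and one extra IoU term when iou_aware is set. A small hypothetical helper capturing that arithmetic:

def yolo_box_channels(class_num, iou_aware=False, anchors_per_loc=3):
    # x/y/w/h + objectness (+ IoU when iou_aware) + per-class scores
    per_anchor = class_num + (6 if iou_aware else 5)
    return anchors_per_loc * per_anchor

assert yolo_box_channels(80) == 255                  # classic COCO head
assert yolo_box_channels(80, iou_aware=True) == 258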
np.random.random(size=(10)).astype(typename) z = callback(x, y) + self.python_api = paddle.tensor.equal_all self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': z} self.op_type = op_type def test_output(self): - self.check_output() + self.check_output(check_eager=True) cls_name = "{0}_{1}_{2}".format(op_type, typename, 'not_shape_equal_all') Cls.__name__ = cls_name @@ -63,12 +65,13 @@ class Cls(op_test.OpTest): def setUp(self): x = y = np.random.random(size=(10, 7)).astype(typename) z = callback(x, y) + self.python_api = paddle.tensor.equal_all self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': z} self.op_type = op_type def test_output(self): - self.check_output() + self.check_output(check_eager=True) cls_name = "{0}_{1}_{2}".format(op_type, typename, 'equal_all') Cls.__name__ = cls_name @@ -82,12 +85,13 @@ def setUp(self): x = np.array([True, False, True]).astype(typename) x = np.array([False, False, True]).astype(typename) z = callback(x, y) + self.python_api = paddle.tensor.equal_all self.inputs = {'X': x, 'Y': y} self.outputs = {'Out': z} self.op_type = op_type def test_output(self): - self.check_output() + self.check_output(check_eager=True) cls_name = "{0}_{1}_{2}".format(op_type, typename, 'equal_all') Cls.__name__ = cls_name diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index 70b3fda79b50f..fd46b41c5f07e 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -40,10 +40,10 @@ def init_data(self): self.expand_times = [1] def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_eager=True) class TestExpandV2OpRank2_DimExpanding(TestExpandV2OpRank1): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py index f0c5316412f1e..3a8a3a96e9a33 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py @@ -209,7 +209,9 @@ def func_example_with_gradient_and_create_graph(self): self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected)) def test_all_cases(self): - if _in_legacy_dygraph(): + self.func_exception() + self.func_example_with_gradient_and_create_graph() + with _test_eager_guard(): self.func_exception() self.func_example_with_gradient_and_create_graph() @@ -296,7 +298,8 @@ def func_example_with_gradient_and_create_graph(self): self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected)) def test_all_cases(self): - if _in_legacy_dygraph(): + self.func_example_with_gradient_and_create_graph() + with _test_eager_guard(): self.func_example_with_gradient_and_create_graph() diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py index f8183bb5f8db2..7dd3841fe4bcb 100644 --- a/python/paddle/fluid/tests/unittests/test_poisson_op.py +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -18,6 +18,7 @@ from op_test import OpTest import math import os +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() paddle.seed(100) @@ -96,11 +97,18 @@ def test_static(self): self.assertTrue(np.min(y_np) >= 0) def test_dygraph(self): - paddle.disable_static() - x = paddle.randn([10, 10], dtype='float32') - y = paddle.poisson(x) - 
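The triple-grad tests above now follow the common convention of running every case twice, once on the legacy dygraph engine and once under the eager guard, instead of skipping when not in legacy mode. Schematically (an illustrative skeleton, not code from this patch):

import unittest
from paddle.fluid.framework import _test_eager_guard

class TestBothDygraphModes(unittest.TestCase):
    def func_case(self):
        pass  # the real assertions live in methods like this

    def test_all_cases(self):
        self.func_case()              # legacy dygraph engine
        with _test_eager_guard():     # re-run the same body in eager mode
            self.func_case()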
self.assertTrue(np.min(y.numpy()) >= 0) - paddle.enable_static() + with paddle.fluid.dygraph.base.guard(): + x = paddle.randn([10, 10], dtype='float32') + y = paddle.poisson(x) + self.assertTrue(np.min(y.numpy()) >= 0) + + with _test_eager_guard(): + x = paddle.randn([10, 10], dtype='float32') + x.stop_gradient = False + y = paddle.poisson(x) + y.backward() + self.assertTrue(np.min(y.numpy()) >= 0) + self.assertTrue(np.array_equal(np.zeros_like(x), x.gradient())) def test_fixed_random_number(self): # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index 817150a21f5e5..ad03fa30009e7 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -21,6 +21,7 @@ from paddle.fluid.op import Operator from op_test import OpTest import paddle +from paddle.fluid.framework import _test_eager_guard paddle.enable_static() @@ -291,6 +292,11 @@ def test_sgd_group_dygraph(self): adam.step() adam.clear_gradients() + def test_eager(self): + with _test_eager_guard(): + self.test_sgd_dygraph() + self.test_sgd_group_dygraph() + class TestSGDMultiPrecision2_0(unittest.TestCase): def dygraph_sgd_mp(self, mp): diff --git a/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py b/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py index 45e88d681d8e0..4e79e8dca138e 100644 --- a/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py +++ b/python/paddle/fluid/tests/unittests/test_triangular_solve_op.py @@ -47,6 +47,7 @@ def set_output(self): def setUp(self): self.op_type = "triangular_solve" + self.python_api = paddle.tensor.linalg.triangular_solve self.config() self.inputs = { @@ -62,10 +63,10 @@ def setUp(self): self.outputs = {'Out': self.output} def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') + self.check_grad(['X', 'Y'], 'Out', check_eager=True) # 2D(broadcast) + 3D, test 'transpose' diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index fdee57bb1253e..46dd0b73a5eb8 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -22,6 +22,7 @@ from ..fluid.layer_helper import LayerHelper from ..fluid import unique_name from ..fluid import layers +from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode __all__ = [] @@ -144,7 +145,11 @@ def _append_optimize_op(self, block, param_and_grad): if find_master else None) lr = self._create_param_lr(param_and_grad) - if framework._non_static_mode(): + if in_dygraph_mode(): + _C_ops.final_state_sgd(param_and_grad[0], lr, param_and_grad[1], + master_weight, find_master) + return None + if _in_legacy_dygraph(): _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight, param_and_grad[0], master_weight) return None diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index b315e3e9673fc..a00ae8046ed68 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -2834,6 +2834,10 @@ def triangular_solve(x, print(out) # [7, -2, -5] """ + if in_dygraph_mode(): + return _C_ops.final_state_triangular_solve(x, y, upper, transpose, + unitriangular) + if paddle.in_dynamic_mode(): return _C_ops.triangular_solve(x, y, 'upper', upper, 'transpose', transpose, 'unitriangular', diff --git a/python/paddle/tensor/logic.py 
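The Python-side changes in optimizer.py, sgd.py and linalg.py above (and in logic.py and manipulation.py below) all use the same three-way dispatch: the generated final-state eager op first, then the legacy dygraph op, then the static-graph program path. A condensed, hedged sketch of that ordering using SGD (the helper name is illustrative and the static branch is omitted):

from paddle import _C_ops
from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode

def _apply_sgd(param, grad, lr, master_weight, find_master):
    if in_dygraph_mode():        # new eager mode -> final-state kernel
        _C_ops.final_state_sgd(param, lr, grad, master_weight, find_master)
        return None
    if _in_legacy_dygraph():     # old dygraph -> attr-style C op
        _C_ops.sgd(param, lr, grad, master_weight, param, master_weight)
        return None
    # otherwise: append an SGD operator to the static-graph block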
diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py
index 6a18e1201785a..d99b9973b485e 100755
--- a/python/paddle/tensor/logic.py
+++ b/python/paddle/tensor/logic.py
@@ -301,6 +301,9 @@ def equal_all(x, y, name=None):
           result2 = paddle.equal_all(x, z)
           print(result2) # result2 = [False ]
     """
+    if in_dygraph_mode():
+        return _C_ops.final_state_equal_all(x, y)
+
     if paddle.in_dynamic_mode():
         return _C_ops.equal_all(x, y)
 
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 389b5dbd7dbec..3a79abd2dc06e 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -2000,6 +2000,9 @@ def expand(x, shape, name=None):
             print(out)
             # [[1, 2, 3], [1, 2, 3]]
     """
+    if in_dygraph_mode():
+        return _C_ops.final_state_expand(x, shape)
+
     if paddle.in_dynamic_mode():
         return _C_ops.expand_v2(x, 'shape', shape)
 
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 6387525fa26f1..b4abe5b303b8e 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -603,6 +603,14 @@
   kernel :
     func : equal
 
+- api : equal_all
+  args : (Tensor x, Tensor y)
+  output : Tensor
+  infer_meta :
+    func : CompareAllInferMeta
+  kernel :
+    func : equal_all
+
 # erf
 - api : erf
   args : (Tensor x)
@@ -633,6 +641,16 @@
     func : exp
   backward : exp_grad
 
+# expand
+- api : expand
+  args : (Tensor x, IntArray shape)
+  output : Tensor
+  infer_meta :
+    func : ExpandInferMeta
+  kernel :
+    func : expand
+  backward : expand_grad
+
 # expand_as
 - api : expand_as
   args : (Tensor x, Tensor y, int[] target_shape)
@@ -1513,7 +1531,7 @@
     func : pixel_shuffle
   backward : pixel_shuffle_grad
 
-# poisson // no need grad
+# poisson
 - api : poisson
   args : (Tensor x)
   output : Tensor
@@ -1521,6 +1539,7 @@
     func : UnchangedInferMeta
   kernel :
     func : poisson
+  backward : poisson_grad
 
 - api : pool2d
   args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
@@ -1775,6 +1794,12 @@
     func : selu
   backward : selu_grad
 
+- api : sgd
+  args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision)
+  output : Tensor(param_out), Tensor(master_param_out)
+  invoke : sgd_impl(param, learning_rate, grad, master_param, multi_precision)
+  optional : master_param
+
 - api : shape
   args : (Tensor input)
   output : Tensor
@@ -2066,7 +2091,7 @@
     func : TriangularSolveInferMeta
   kernel :
     func : triangular_solve
-  # backward : triangular_solve_grad
+  backward : triangular_solve_grad
 
 - api : tril_triu
   args : (Tensor x, int diagonal, bool lower)
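Reviewer note: `equal_all` and `expand` get the same treatment as `triangular_solve` — a new yaml entry generates the final-state kernel, and the Python wrapper dispatches to `_C_ops.final_state_*` when `in_dygraph_mode()` is true, falling back to the legacy `_C_ops` call otherwise. User-facing behaviour is unchanged; a short sketch based on the docstring examples touched above:

```python
import paddle

x = paddle.to_tensor([1, 2, 3])
y = paddle.to_tensor([1, 2, 3])
z = paddle.to_tensor([1, 4, 3])

# equal_all reduces the elementwise comparison to a single boolean element
print(paddle.equal_all(x, y).numpy())  # [ True]
print(paddle.equal_all(x, z).numpy())  # [False]

# expand broadcasts x to the requested shape
out = paddle.expand(x, shape=[2, 3])
print(out.numpy())  # [[1 2 3] [1 2 3]]
```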
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index d243b4d160d57..d0f337cb054f4 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -492,6 +492,16 @@
     func : expand_as_grad
   no_need_buffer : x
 
+- backward_api : expand_grad
+  forward : expand (Tensor x, IntArray shape) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad, IntArray shape)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : expand_grad
+
 - backward_api : expm1_grad
   forward : expm1 (Tensor x) -> Tensor(out)
   args : (Tensor out, Tensor out_grad)
@@ -1159,6 +1169,16 @@
   kernel :
     func : pixel_shuffle_grad
 
+- backward_api : poisson_grad
+  forward : poisson (Tensor x) -> Tensor(out)
+  args : (Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [out_grad]
+  kernel :
+    func : poisson_grad
+
 - backward_api : pool2d_grad
   forward : pool2d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm) -> Tensor(out)
   args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm)
@@ -1438,7 +1458,7 @@
     func : GeneralTernaryGradInferMeta
     param : [out, fwd_grad_out, grad_grad_x]
   kernel :
-    func : sigmoid_double_grad
+    func : sigmoid_triple_grad
 
 - backward_api : silu_grad
   forward : silu (Tensor x) -> Tensor(out)
@@ -1685,6 +1705,16 @@
   kernel :
     func : transpose_grad
 
+- backward_api : triangular_solve_grad
+  forward : triangular_solve (Tensor x, Tensor y, bool upper, bool tranpose, bool unitriangular) -> Tensor(out)
+  args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper, bool tranpose, bool unitriangular)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : GeneralBinaryGradInferMeta
+    param : [x, y]
+  kernel :
+    func : triangular_solve_grad
+
 - backward_api : tril_triu_grad
   forward : tril_triu(Tensor x, int diagonal, bool lower) -> Tensor(out)
   args : (Tensor out_grad, int diagonal, bool lower)
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 49b84da01b9bb..ce67912eb2266 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -231,10 +231,10 @@ if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     check_approval 1 6888866 39303645
 fi
 
-HAS_MODIFIED_DECLARATIONS=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels/declarations.h" || true`
+HAS_MODIFIED_DECLARATIONS=`git diff -U0 upstream/$BRANCH |grep "^+" |grep "paddle/phi/kernels/declarations.h" || true`
 if [ "${HAS_MODIFIED_DECLARATIONS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    echo_line="You must be approved by chenwhql for any use of paddle/phi/kernels/declarations.h. Thanks!\n"
-    check_approval 1 22561442
+    echo_line="You must be approved by chenwhql or zyfncg for paddle/phi/kernels/declarations.h using. Thanks!\n"
+    check_approval 1 chenwhql zyfncg
 fi
 
 ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
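Reviewer note: the new `poisson_grad` entry makes `paddle.poisson` participate in autograd only in the structural sense — as the updated test asserts, the backward fills the input gradient with zeros, since the sampling itself carries no useful gradient. A minimal sketch of that check under eager mode, mirroring the added test body:

```python
import numpy as np
import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    x = paddle.randn([10, 10], dtype='float32')
    x.stop_gradient = False
    y = paddle.poisson(x)
    y.backward()
    # samples are non-negative counts
    assert np.min(y.numpy()) >= 0
    # the registered backward produces an all-zero gradient for x
    assert np.array_equal(np.zeros_like(x.numpy()), x.gradient())
```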