From 9312d164f1c232321266e46fac31869b7aea30b3 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 11 May 2022 06:17:21 +0000 Subject: [PATCH 01/51] add init file --- paddle/fluid/operators/graph_send_e_recv_op.cc | 18 ++++++++++++++++++ .../cpu/graph_send_e_recv_grad_kernel.cc | 13 +++++++++++++ .../kernels/cpu/graph_send_e_recv_kernel.cc | 13 +++++++++++++ .../gpu/graph_send_e_recv_grad_kernel.cu | 13 +++++++++++++ .../kernels/gpu/graph_send_e_recv_kernel.cu | 13 +++++++++++++ .../kernels/graph_send_e_recv_grad_kernel.h | 18 ++++++++++++++++++ paddle/phi/kernels/graph_send_e_recv_kernel.h | 18 ++++++++++++++++++ 7 files changed, 106 insertions(+) create mode 100644 paddle/fluid/operators/graph_send_e_recv_op.cc create mode 100644 paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc create mode 100644 paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu create mode 100644 paddle/phi/kernels/graph_send_e_recv_grad_kernel.h create mode 100644 paddle/phi/kernels/graph_send_e_recv_kernel.h diff --git a/paddle/fluid/operators/graph_send_e_recv_op.cc b/paddle/fluid/operators/graph_send_e_recv_op.cc new file mode 100644 index 0000000000000..9953cec63fd4d --- /dev/null +++ b/paddle/fluid/operators/graph_send_e_recv_op.cc @@ -0,0 +1,18 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc new file mode 100644 index 0000000000000..0544a1e298b8e --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc @@ -0,0 +1,13 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc new file mode 100644 index 0000000000000..0544a1e298b8e --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc @@ -0,0 +1,13 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu new file mode 100644 index 0000000000000..0544a1e298b8e --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -0,0 +1,13 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu new file mode 100644 index 0000000000000..0544a1e298b8e --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu @@ -0,0 +1,13 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. diff --git a/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h new file mode 100644 index 0000000000000..9c6a49f91e63f --- /dev/null +++ b/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h @@ -0,0 +1,18 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
diff --git a/paddle/phi/kernels/graph_send_e_recv_kernel.h b/paddle/phi/kernels/graph_send_e_recv_kernel.h
new file mode 100644
index 0000000000000..9c6a49f91e63f
--- /dev/null
+++ b/paddle/phi/kernels/graph_send_e_recv_kernel.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"

From dd22ac2388458303917a19b35a244de840d3a2d7 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Thu, 12 May 2022 03:44:12 +0000
Subject: [PATCH 02/51] add op definition and infermeta

---
 .../fluid/operators/graph_send_e_recv_op.cc | 127 +++++++++++++++++-
 paddle/phi/infermeta/multiary.cc            |  80 +++++++++++
 paddle/phi/infermeta/multiary.h             |  10 ++
 3 files changed, 216 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/graph_send_e_recv_op.cc b/paddle/fluid/operators/graph_send_e_recv_op.cc
index 9953cec63fd4d..d23c8404e8f3b 100644
--- a/paddle/fluid/operators/graph_send_e_recv_op.cc
+++ b/paddle/fluid/operators/graph_send_e_recv_op.cc
@@ -15,4 +15,129 @@
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/ternary.h"
+#include "paddle/phi/infermeta/multiary.h"
+
+namespace paddle {
+namespace operators {
+
+class GraphSendERecvOP : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+class GraphSendERecvGradOP : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto in_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
+    auto e_dims = ctx->GetInputDim("E");
+    ctx->SetOutputDim(framework::GradVarName("E"), in_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
+
+class GraphSendERecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor with data type float32, float64, int32, int64.");
+    AddInput("E",
+             "The input edge weight tensor, data type should be same with X");
+    AddInput("Src_index", "The source index tensor.");
+    AddInput("Dst_index", "The destination index tensor.");
+    AddOutput("Out", "Output tensor of graph_send_e_recv op.");
+    AddOutput("Dst_count",
+              "Count tensor of Dst_index, mainly for MEAN pool_type.")
+        .AsIntermediate();
+    AddAttr<std::string>("compute_type",
+                         "(string, default 'ADD')"
+                         "Define different computation types between X and E.")
+        .SetDefault("ADD")
+        .InEnum({"ADD", "SUB", "MUL", "DIV"});
+    AddAttr<std::string>("pool_type",
+                         "(string, default 'SUM')"
+                         "Define different pool types to receive the result "
+                         "tensors of Dst_index.")
+        .SetDefault("SUM")
+        .InEnum({"SUM", "MEAN", "MIN", "MAX"});
+    AddAttr<int64_t>(
+        "out_size",
+        "(int64_t, default 0)"
+        "Define the first dimension of Output tensor."
+        "If set default 0, then the shape of Out is the same with X.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+Graph Learning Send_E_Recv combine operator.
+
+$Out = Recv(Compute(Send(X, Src_index), E, compute_type), Dst_index, pool_type)$
+
+This operator is mainly used in the Graph Learning domain, and its main purpose is to reduce
+intermediate memory consumption in the process of message passing.
+
+Taking `X` as the input tensor, we first use `src_index` to gather the corresponding data.
+The gathered data is then combined with `E` under the given compute_type (add, sub, mul, or div),
+and the result is scattered to the corresponding positions of the output tensor through
+`dst_index`, reduced with the given pool_type (sum, mean, max, or min).
+
+)DOC");
+  }
+};
+
+template <typename T>
+class GraphSendERecvGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("graph_send_e_recv_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("E", this->Input("E"));
+    op->SetInput("Src_index", this->Input("Src_index"));
+    op->SetInput("Dst_index", this->Input("Dst_index"));
+
+    if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") {
+      op->SetInput("Dst_count", this->Output("Dst_count"));
+    }
+
+    if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" ||
+        BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") {
+      op->SetInput("Out", this->Output("Out"));
+    }
+
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(graph_send_e_recv, GraphSendERecvInferShapeFunctor,
+                            PD_INFER_META(phi::GraphSendERecvInferMeta));
+REGISTER_OPERATOR(graph_send_e_recv, ops::GraphSendERecvOP,
+                  ops::GraphSendERecvOpMaker,
+                  ops::GraphSendERecvGradOpMaker<paddle::framework::OpDesc>,
+                  ops::GraphSendERecvGradOpMaker<paddle::imperative::OpBase>,
+                  GraphSendERecvInferShapeFunctor);
+REGISTER_OPERATOR(graph_send_e_recv_grad, ops::GraphSendERecvGradOP);
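As a quick illustration of the semantics the DOC string above describes: for compute_type ADD and pool_type SUM, the operator behaves like the following self-contained sketch. All shapes and values here are invented for illustration; the real kernels operate on DenseTensor and additionally support broadcasting between X and E:

  // graph_send_e_recv with compute_type="ADD", pool_type="SUM", scalar features.
  #include <cstdio>
  #include <vector>

  int main() {
    std::vector<float> x = {1.f, 2.f, 3.f};           // X: 3 node features
    std::vector<float> e = {10.f, 20.f, 30.f, 40.f};  // E: 4 edge weights
    std::vector<int> src = {0, 1, 2, 0};              // Src_index
    std::vector<int> dst = {1, 2, 1, 0};              // Dst_index
    std::vector<float> out(x.size(), 0.f);            // Out, zero-initialized

    for (size_t i = 0; i < src.size(); ++i) {
      // Send: gather X[src[i]]; Compute: add E[i]; Recv: scatter-add (SUM).
      out[dst[i]] += x[src[i]] + e[i];
    }
    for (float v : out) std::printf("%.0f ", v);  // prints: 41 44 22
    return 0;
  }

Because gathered rows and per-edge results are consumed on the fly, no [num_edges, feature] intermediate tensor is ever materialized, which is the memory saving the DOC string refers to.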
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index e793eb8e66872..3a606845d81f9 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -2310,6 +2310,86 @@ void Yolov3LossInferMeta(const MetaTensor& x,
   gt_match_mask->set_dtype(x.dtype());
 }
 
+void GraphSendERecvInferMeta(const MetaTensor& x,
+                             const MetaTensor& e,
+                             const MetaTensor& src_index,
+                             const MetaTensor& dst_index,
+                             const std::string& compute_type,
+                             const std::string& pool_type,
+                             int64_t out_size,
+                             MetaTensor* out,
+                             MetaTensor* dst_count) {
+  auto src_index_dims = src_index.dims();
+  if (src_index_dims.size() == 2) {
+    PADDLE_ENFORCE_EQ(src_index_dims[1],
+                      1,
+                      phi::errors::InvalidArgument(
+                          "The last dim of Src_index should be 1 when it "
+                          "is 2D, but we get %d",
+                          src_index_dims[1]));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        src_index_dims.size(),
+        1,
+        phi::errors::InvalidArgument(
+            "The Src_index should be 1D, when it is not 2D, but we get %d",
+            src_index_dims.size()));
+  }
+
+  auto dst_index_dims = dst_index.dims();
+  if (dst_index_dims.size() == 2) {
+    PADDLE_ENFORCE_EQ(dst_index_dims[1],
+                      1,
+                      phi::errors::InvalidArgument(
+                          "The last dim of Dst_index should be 1 when it "
+                          "is 2D, but we get %d",
+                          dst_index_dims[1]));
+  } else {
+    PADDLE_ENFORCE_EQ(
+        dst_index_dims.size(),
+        1,
+        phi::errors::InvalidArgument("The Dst_index should be 1D, "
+                                     "when it is not 2D, but we get %d",
+                                     dst_index_dims.size()));
+  }
+
+  PADDLE_ENFORCE_EQ(src_index_dims[0],
+                    dst_index_dims[0],
+                    phi::errors::InvalidArgument(
+                        "Src_index and Dst_index should have the same shape."));
+
+  auto e_dims = e.dims();
+  PADDLE_ENFORCE_EQ(
+      e_dims[0],
+      src_index_dims[0],
+      phi::errors::InvalidArgument(
+          "Expect Input E to have size %d on the first dimension, "
+          "but we get %d",
+          src_index_dims[0],
+          e_dims[0]));
+
+  auto dims = x.dims();
+  if (out_size <= 0) {
+    out->set_dims(dims);
+  } else {
+    std::vector<int64_t> dims_ = phi::vectorize(dims);
+    if (dims_.size() > 0) {
+      dims_[0] = out_size;
+    }
+    out->set_dims(phi::make_ddim(dims_));
+  }
+  out->set_dtype(x.dtype());
+
+  if (pool_type == "MEAN") {
+    if (out_size <= 0) {
+      dst_count->set_dims({dims[0]});
+    } else {
+      dst_count->set_dims({out_size});
+    }
+    dst_count->set_dtype(DataType::INT32);
+  }
+}
+
 }  // namespace phi
 
 PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta);
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index 65b5819b602ba..edc6df4961b94 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -367,4 +367,14 @@ void Yolov3LossInferMeta(const MetaTensor& x,
                          MetaTensor* objectness_mask,
                          MetaTensor* gt_match_mask);
 
+void GraphSendERecvInferMeta(const MetaTensor& x,
+                             const MetaTensor& e,
+                             const MetaTensor& src_index,
+                             const MetaTensor& dst_index,
+                             const std::string& compute_type,
+                             const std::string& pool_type,
+                             int64_t out_size,
+                             MetaTensor* out,
+                             MetaTensor* dst_count);
+
 }  // namespace phi

From a0938cb4cb08fa5c5937fd8b1379195e5550cbb0 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Thu, 12 May 2022 09:19:54 +0000
Subject: [PATCH 03/51] add kernel definition funcs

---
 .../kernels/graph_send_e_recv_grad_kernel.h   | 18 ++++++++++++++++++
 paddle/phi/kernels/graph_send_e_recv_kernel.h | 16 ++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h
index 9c6a49f91e63f..8b5fa72f14f57 100644
--- a/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h
+++ b/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h
@@ -16,3 +16,21 @@
 #include <string>
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/utils/optional.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GraphSendERecvGradKernel(const Context& ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& e,
+                              const DenseTensor& src_index,
+                              const DenseTensor& dst_index,
+                              paddle::optional<const DenseTensor&> out,
+                              paddle::optional<const DenseTensor&> dst_count,
+                              const DenseTensor& out_grad,
+                              const std::string& compute_type,
+                              const std::string& pool_type,
+                              DenseTensor* x_grad,
+                              DenseTensor* e_grad);
+}  // namespace phi
diff --git a/paddle/phi/kernels/graph_send_e_recv_kernel.h b/paddle/phi/kernels/graph_send_e_recv_kernel.h
index 9c6a49f91e63f..f460ab7b3cbe3 100644
--- a/paddle/phi/kernels/graph_send_e_recv_kernel.h
+++ b/paddle/phi/kernels/graph_send_e_recv_kernel.h
@@ -16,3 +16,19 @@
 #include <string>
 #include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void GraphSendERecvKernel(const Context& ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& e,
+                          const DenseTensor& src_index,
+                          const DenseTensor& dst_index,
+                          const std::string& compute_type,
+                          const std::string& pool_type,
+                          int64_t out_size,
+                          DenseTensor* out,
+                          DenseTensor* dst_count);
+
+}  // namespace phi

From 1bc283ca198c2530e9dd43f441c27b6895a12ef4 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Tue, 17 May 2022 06:58:29 +0000
Subject: [PATCH 04/51] add broadcast infer shape

---
 paddle/phi/infermeta/multiary.cc | 44 ++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 3a606845d81f9..4c26491f11588 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/core/meta_tensor.h"
+#include "paddle/phi/kernels/funcs/common_shape.h"
 #include "paddle/phi/kernels/funcs/concat_funcs.h"
 
 namespace phi {
@@ -2368,26 +2369,43 @@ void GraphSendERecvInferMeta(const MetaTensor& x,
           src_index_dims[0],
           e_dims[0]));
 
-  auto dims = x.dims();
-  if (out_size <= 0) {
-    out->set_dims(dims);
-  } else {
-    std::vector<int64_t> dims_ = phi::vectorize(dims);
-    if (dims_.size() > 0) {
-      dims_[0] = out_size;
-    }
-    out->set_dims(phi::make_ddim(dims_));
-  }
-  out->set_dtype(x.dtype());
-
+  auto x_dims = x.dims();
   if (pool_type == "MEAN") {
     if (out_size <= 0) {
-      dst_count->set_dims({dims[0]});
+      dst_count->set_dims({x_dims[0]});
     } else {
       dst_count->set_dims({out_size});
     }
     dst_count->set_dtype(DataType::INT32);
   }
+
+  // Infer out's shape according to x and e(need broadcasting condition)
+  out->set_dtype(x.dtype());
+  // Assume broadcasting is always needed for now; differentiate the cases later.
+  std::vector<int64_t> x_dims1 = phi::vectorize(x_dims);
+  std::vector<int64_t> e_dims1 = phi::vectorize(e_dims);
+  std::vector<int64_t> x_dims2(x_dims1.begin() + 1, x_dims1.end());
+  std::vector<int64_t> e_dims2(e_dims1.begin() + 1, e_dims1.end());
+
+  int max_dim = std::max(x_dims2.size(), e_dims2.size());
+  int axis = std::abs(x_dims2.size() - e_dims2.size());
+  std::vector<int> x_dims_array(max_dim);
+  std::vector<int> e_dims_array(max_dim);
+  std::vector<int> out_dims_array(max_dim);
+  // Only need to broadcast dimensions other than the 0th dimension.
+  GetBroadcastDimsArrays(phi::make_ddim(x_dims2),
+                         phi::make_ddim(e_dims2),
+                         x_dims_array.data(),
+                         e_dims_array.data(),
+                         out_dims_array.data(),
+                         max_dim,
+                         axis);
+  if (out_size <= 0) {
+    out_dims_array.insert(out_dims_array.begin(), x_dims[0]);
+  } else {
+    out_dims_array.insert(out_dims_array.begin(), out_size);
+  }
+  out->set_dims(phi::make_ddim(out_dims_array));
 }
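A concrete walk-through of the broadcast rule introduced above, with hypothetical shapes: if X is [100, 8, 1] and E is [500, 1, 4], the 0th (gather/scatter) dimensions are ignored, the tails [8, 1] and [1, 4] broadcast to [8, 4], and Out gets shape [100, 8, 4] (or [out_size, 8, 4] when out_size > 0). A minimal sketch of the trailing-dimension broadcast, standing in for GetBroadcastDimsArrays:

  // Illustrative only: right-align the two shape tails and broadcast them.
  #include <algorithm>
  #include <cstdio>
  #include <vector>

  std::vector<int> BroadcastTail(std::vector<int> a, std::vector<int> b) {
    size_t n = std::max(a.size(), b.size());
    a.insert(a.begin(), n - a.size(), 1);  // pad the shorter tail with 1s
    b.insert(b.begin(), n - b.size(), 1);
    std::vector<int> out(n);
    for (size_t i = 0; i < n; ++i) {
      // Each dimension pair must match or contain a 1; take the larger.
      out[i] = std::max(a[i], b[i]);
    }
    return out;
  }

  int main() {
    auto tail = BroadcastTail({8, 1}, {1, 4});
    for (int d : tail) std::printf("%d ", d);  // prints: 8 4
    return 0;
  }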
From c1d51a95bb1448f31ac901a46a5763056ca75c69 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Thu, 19 May 2022 11:57:57 +0000
Subject: [PATCH 05/51] add gpu forward kernel

---
 paddle/phi/infermeta/multiary.cc              |   3 +-
 .../phi/kernels/gpu/graph_send_e_recv_funcs.h | 150 ++++++
 .../kernels/gpu/graph_send_e_recv_kernel.cu   | 439 ++++++++++++++++++
 .../impl/graph_send_e_recv_kernel_impl.h      |  84 ++++
 4 files changed, 674 insertions(+), 2 deletions(-)
 create mode 100644 paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h
 create mode 100644 paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h

diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 4c26491f11588..d506a35c967e3 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -2364,7 +2364,7 @@ void GraphSendERecvInferMeta(const MetaTensor& x,
       e_dims[0],
       src_index_dims[0],
       phi::errors::InvalidArgument(
-          "Expect Input E to have size %d on the first dimension, "
+          "Expect Input E to have size %d as Src_index on the first dimension, "
           "but we get %d",
           src_index_dims[0],
           e_dims[0]));
@@ -2381,7 +2381,6 @@ void GraphSendERecvInferMeta(const MetaTensor& x,
 
   // Infer out's shape according to x and e(need broadcasting condition)
   out->set_dtype(x.dtype());
-  // Assume broadcasting is always needed for now; differentiate the cases later.
   std::vector<int64_t> x_dims1 = phi::vectorize(x_dims);
   std::vector<int64_t> e_dims1 = phi::vectorize(e_dims);
   std::vector<int64_t> x_dims2(x_dims1.begin() + 1, x_dims1.end());
diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h
new file mode 100644
index 0000000000000..305adbda0a26e
--- /dev/null
+++ b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h
@@ -0,0 +1,150 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/kernels/graph_send_e_recv_kernel.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/fill.h>
+#include <algorithm>
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/hostdevice.h"
+
+namespace phi {
+
+#define CUDA_MAX_NUM_THREADS 1024
+
+inline int FindNumThreads(int dim, int max_num_threads = CUDA_MAX_NUM_THREADS) {
+  PADDLE_ENFORCE_GE(dim,
+                    0,
+                    phi::errors::PreconditionNotMet(
+                        "Required dim >= 0, but received dim = %d", dim));
+  if (dim == 0) return 1;
+  int res = max_num_threads;
+  while (res > dim) {
+    res = res >> 1;
+  }
+  return res;
+}
+
+template <typename T>
+struct GraphSendERecvSumCUDAFunctor {
+  DEVICE inline void operator()(T* output, T val) {
+    paddle::platform::CudaAtomicAdd(output, val);
+  }
+};
+
+template <typename T>
+struct GraphSendERecvMaxCUDAFunctor {
+  DEVICE inline void operator()(T* output, T val) {
+    paddle::platform::CudaAtomicMax(output, val);
+  }
+};
+
+template <typename T>
+struct GraphSendERecvMinCUDAFunctor {
+  DEVICE inline void operator()(T* output, T val) {
+    paddle::platform::CudaAtomicMin(output, val);
+  }
+};
+
+template <typename T,
+          typename IndexT,
+          typename ComputeFunctor,
+          typename ReduceFunctor>
+__global__ void GraphSendERecvCUDAKernel(const T* x_data,
+                                         const T* e_data,
+                                         const IndexT* src_indices,
+                                         const IndexT* dst_indices,
+                                         const int64_t* xbcast_off,
+                                         const int64_t* ebcast_off,
+                                         T* output,
+                                         int64_t index_size,
+                                         int64_t x_len,
+                                         int64_t e_len,
+                                         int64_t out_len,
+                                         bool use_bcast,
+                                         ComputeFunctor cfunctor,
+                                         ReduceFunctor rfunctor) {
+  IndexT ty = blockIdx.y * blockDim.y + threadIdx.y;
+  const IndexT stride_y = blockDim.y * gridDim.y;
+
+  while (ty < index_size) {
+    IndexT src = src_indices[ty];
+    IndexT dst = dst_indices[ty];
+    int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t stride_x = blockDim.x * gridDim.x;
+
+    const T* x_off = x_data + src * x_len;
+    const T* e_off = e_data + ty * e_len;
+    T* out_off = output + dst * out_len;
+    while (tx < out_len) {
+      int64_t x_add = use_bcast ? xbcast_off[tx] : tx;
+      int64_t e_add = use_bcast ? ebcast_off[tx] : tx;
+      T val = cfunctor(x_off + x_add, e_off + e_add);
+      rfunctor(out_off + tx, val);
+      tx += stride_x;
+    }
+    ty += stride_y;
+  }
+}
+
+// For backward mean
+template <typename T, typename IndexT>
+__global__ void ManipulateMeanGradCUDAKernel(const T* params,
+                                             const IndexT* src_indices,
+                                             const IndexT* dst_indices,
+                                             T* output,
+                                             size_t index_size,
+                                             size_t slice_size,
+                                             const int32_t* dst_count) {
+  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
+    int64_t indices_i = i / slice_size;
+    int64_t slice_i = i - indices_i * slice_size;
+    IndexT src_i = src_indices[indices_i];
+    IndexT dst_i = dst_indices[indices_i];
+    int64_t in_i = src_i * slice_size + slice_i;
+    int64_t out_i = dst_i * slice_size + slice_i;
+    paddle::platform::CudaAtomicAdd(output + out_i,
+                                    *(params + in_i) / dst_count[src_i]);
+  }
+}
+
+// For backward min and max
+template <typename T, typename IndexT>
+__global__ void ManipulateMinMaxGradCUDAKernel(const T* params,
+                                               const IndexT* src_indices,
+                                               const IndexT* dst_indices,
+                                               T* output,
+                                               size_t index_size,
+                                               size_t slice_size,
+                                               const T* ptr_input,
+                                               const T* ptr_output) {
+  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
+    int64_t indices_i = i / slice_size;
+    int64_t slice_i = i - indices_i * slice_size;
+    IndexT src_i = src_indices[indices_i];
+    IndexT dst_i = dst_indices[indices_i];
+    int64_t in_i = src_i * slice_size + slice_i;
+    int64_t out_i = dst_i * slice_size + slice_i;
+    paddle::platform::CudaAtomicAdd(
+        output + out_i,
+        *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i)));
+  }
+}
+
+}  // namespace phi
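The kernels above use a 2D launch: the x dimension of each block walks the out_len feature positions while the y dimension walks the edges, so FindNumThreads splits the 1024-thread budget between the two. A host-side sketch of how the launch geometry comes out, with example sizes:

  // How the launch geometry for GraphSendERecvCUDAKernel is derived.
  #include <cstdio>

  constexpr int kMaxThreads = 1024;  // CUDA_MAX_NUM_THREADS

  int FindNumThreads(int dim, int max_num_threads = kMaxThreads) {
    if (dim == 0) return 1;
    int res = max_num_threads;
    while (res > dim) res >>= 1;  // halve until it no longer exceeds dim
    return res;
  }

  int main() {
    int out_len = 48, index_size = 10000;    // example sizes
    int ntx = FindNumThreads(out_len);       // 32 threads over features
    int nty = kMaxThreads / ntx;             // 32 threads over edges
    int nbx = (out_len + ntx - 1) / ntx;     // 2 blocks in x
    int nby = (index_size + nty - 1) / nty;  // 313 blocks in y
    std::printf("block=(%d,%d) grid=(%d,%d)\n", ntx, nty, nbx, nby);
    return 0;
  }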
diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu
index 0544a1e298b8e..ee6180ff53674 100644
--- a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu
@@ -11,3 +11,442 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+#include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h"
+#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h"
+#include "paddle/phi/kernels/graph_send_e_recv_kernel.h"
+#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h"
+
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/fill.h>
+#include <limits>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/elementwise_functor.h"
+
+namespace phi {
+
+template <typename Context, typename T, typename IndexT>
+void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx,
+                                            const DenseTensor& x,
+                                            const DenseTensor& e,
+                                            const DenseTensor& src_index,
+                                            const DenseTensor& dst_index,
+                                            const std::string& compute_type,
+                                            const std::string& pool_type,
+                                            int64_t out_size,
+                                            DenseTensor* out,
+                                            DenseTensor* dst_count = nullptr) {
+  const int& index_size = src_index.dims()[0];
+  ctx.template Alloc<T>(out);
+  T* out_data = out->data<T>();
+  auto out_dims = out->dims();
+  int64_t memset_size = 1;
+  for (int i = 0; i < out_dims.size(); i++) {
+    memset_size *= out_dims[i];
+  }
+  const size_t& memset_bytes = memset_size * sizeof(T);
+  if (pool_type == "SUM" || pool_type == "MEAN") {
+#ifdef PADDLE_WITH_HIP
+    hipMemset(out_data, 0, memset_bytes);
+#else
+    cudaMemset(out_data, 0, memset_bytes);
+#endif
+  } else if (pool_type == "MAX") {
+    thrust::device_ptr<T> out_data_ptr(out_data);
+    thrust::fill(thrust::device,
+                 out_data_ptr,
+                 out_data_ptr + memset_size,
+                 std::numeric_limits<T>::min());
+  } else if (pool_type == "MIN") {
+    thrust::device_ptr<T> out_data_ptr(out_data);
+    thrust::fill(thrust::device,
+                 out_data_ptr,
+                 out_data_ptr + memset_size,
+                 std::numeric_limits<T>::max());
+  }
+
+  if (index_size == 0) return;
+
+  const auto& bcast_info = CaclBCastInfo(x.dims(), e.dims());
+  const T* x_data = x.data<T>();
+  const T* e_data = e.data<T>();
+  const IndexT* s_index = src_index.data<IndexT>();
+  const IndexT* d_index = dst_index.data<IndexT>();
+
+  thrust::device_vector<int64_t> x_bcastoff, e_bcastoff;
+  if (bcast_info.use_bcast) {
+    x_bcastoff.resize(bcast_info.out_len);
+    e_bcastoff.resize(bcast_info.out_len);
+    cudaMemcpy(thrust::raw_pointer_cast(x_bcastoff.data()),
+               bcast_info.x_offset.data(),
+               sizeof(int64_t) * bcast_info.out_len,
+               cudaMemcpyHostToDevice);
+    cudaMemcpy(thrust::raw_pointer_cast(e_bcastoff.data()),
+               bcast_info.e_offset.data(),
+               sizeof(int64_t) * bcast_info.out_len,
+               cudaMemcpyHostToDevice);
+  }
+
+  int64_t out_len = bcast_info.out_len;
+  const int ntx = FindNumThreads(out_len);  // number of threads per block
+  const int nty = CUDA_MAX_NUM_THREADS / ntx;
+  const int nbx = (out_len + ntx - 1) / ntx;
+  const int nby = (index_size + nty - 1) / nty;
+  const dim3 grid(nbx, nby);
+  const dim3 block(ntx, nty);
+  int64_t* x_bcastoff_data = thrust::raw_pointer_cast(x_bcastoff.data());
+  int64_t* e_bcastoff_data = thrust::raw_pointer_cast(e_bcastoff.data());
+  int64_t input_size = x.dims()[0];
+#ifdef PADDLE_WITH_HIP
+  int block_ = 256;
+#else
+  int block_ = 1024;
+#endif
+  int max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
+  if (pool_type == "SUM" || pool_type == "MEAN") {
+    GraphSendERecvSumCUDAFunctor<T> sum_functor;
+    if (compute_type == "ADD") {
+      AddFunctor<T> add_funtor;
+      GraphSendERecvCUDAKernel<T,
+                               IndexT,
+                               GraphSendERecvSumCUDAFunctor<T>,
+                               AddFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          add_funtor,
+          sum_functor);
+    } else if (compute_type == "SUB") {
+      SubtractFunctor<T> sub_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvSumCUDAFunctor<T>,
+          SubtractFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          sub_functor,
+          sum_functor);
+    } else if (compute_type == "MUL") {
+      MultiplyFunctor<T> mul_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvSumCUDAFunctor<T>,
+          MultiplyFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          mul_functor,
+          sum_functor);
+    } else if (compute_type == "DIV") {
+      DivideFunctor<T> div_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvSumCUDAFunctor<T>,
+          DivideFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          div_functor,
+          sum_functor);
+    }
+    if (pool_type == "MEAN") {
+      ctx.template Alloc<int32_t>(dst_count);
+      int32_t* dst_count_data = dst_count->data<int32_t>();
+      if (out_size > 0) {
+        input_size = out_size;
+      }
+#ifdef PADDLE_WITH_HIP
+      hipMemset(dst_count_data, 0, input_size * sizeof(int));
+#else
+      cudaMemset(dst_count_data, 0, input_size * sizeof(int));
+#endif
+      int64_t grid_count = (index_size + block_ - 1) / block_;
+      ComputeCountCUDAKernel<T, IndexT><<<grid_count, block_, 0, ctx.stream()>>>(
+          dst_count_data, d_index, index_size);
+
+      int64_t grid_mean = (input_size * out_len + block_ - 1) / block_;
+      int64_t grid_mean_ =
+          grid_mean < max_grid_dimx ? grid_mean : max_grid_dimx;
+      ManipulateMeanCUDAKernel<T><<<grid_mean_, block_, 0, ctx.stream()>>>(
+          out_data, dst_count_data, input_size, out_len);
+    }
+  } else if (pool_type == "MAX") {
+    GraphSendERecvMaxCUDAFunctor<T> max_functor;
+    if (compute_type == "ADD") {
+      AddFunctor<T> add_funtor;
+      GraphSendERecvCUDAKernel<T,
+                               IndexT,
+                               GraphSendERecvMaxCUDAFunctor<T>,
+                               AddFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          add_funtor,
+          max_functor);
+    } else if (compute_type == "SUB") {
+      SubtractFunctor<T> sub_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvMaxCUDAFunctor<T>,
+          SubtractFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          sub_functor,
+          max_functor);
+    } else if (compute_type == "MUL") {
+      MultiplyFunctor<T> mul_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvMaxCUDAFunctor<T>,
+          MultiplyFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          mul_functor,
+          max_functor);
+    } else if (compute_type == "DIV") {
+      DivideFunctor<T> div_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvMaxCUDAFunctor<T>,
+          DivideFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          div_functor,
+          max_functor);
+    }
+    if (out_size > 0) {
+      input_size = out_size;
+    }
+    int64_t grid_max = (input_size * out_len + block_ - 1) / block_;
+    int64_t grid_max_ = grid_max < max_grid_dimx ? grid_max : max_grid_dimx;
+    InputResetMaxCUDAKernel<T><<<grid_max_, block_, 0, ctx.stream()>>>(
+        out_data, input_size, out_len);
+  } else if (pool_type == "MIN") {
+    GraphSendERecvMinCUDAFunctor<T> min_functor;
+    if (compute_type == "ADD") {
+      AddFunctor<T> add_funtor;
+      GraphSendERecvCUDAKernel<T,
+                               IndexT,
+                               GraphSendERecvMinCUDAFunctor<T>,
+                               AddFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          add_funtor,
+          min_functor);
+    } else if (compute_type == "SUB") {
+      SubtractFunctor<T> sub_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvMinCUDAFunctor<T>,
+          SubtractFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          sub_functor,
+          min_functor);
+    } else if (compute_type == "MUL") {
+      MultiplyFunctor<T> mul_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvMinCUDAFunctor<T>,
+          MultiplyFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          mul_functor,
+          min_functor);
+    } else if (compute_type == "DIV") {
+      DivideFunctor<T> div_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvMinCUDAFunctor<T>,
+          DivideFunctor<T>><<<grid, block, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          s_index,
+          d_index,
+          x_bcastoff_data,
+          e_bcastoff_data,
+          out_data,
+          index_size,
+          bcast_info.x_len,
+          bcast_info.e_len,
+          out_len,
+          bcast_info.use_bcast,
+          div_functor,
+          min_functor);
+    }
+    if (out_size > 0) {
+      input_size = out_size;
+    }
+    int64_t grid_min = (input_size * out_len + block_ - 1) / block_;
+    int64_t grid_min_ = grid_min < max_grid_dimx ? grid_min : max_grid_dimx;
+    InputResetMinCUDAKernel<T><<<grid_min_, block_, 0, ctx.stream()>>>(
+        out_data, input_size, out_len);
+  }
+}
+
+template <typename T, typename Context>
+void GraphSendERecvKernel(const Context& ctx,
+                          const DenseTensor& x,
+                          const DenseTensor& e,
+                          const DenseTensor& src_index,
+                          const DenseTensor& dst_index,
+                          const std::string& compute_type,
+                          const std::string& pool_type,
+                          int64_t out_size,
+                          DenseTensor* out,
+                          DenseTensor* dst_count) {
+  auto index_type = src_index.dtype();
+  if (index_type == phi::DataType::INT32) {
+    GraphSendERecvOpCUDAKernelLaunchHelper<Context, T, int32_t>(ctx,
+                                                                x,
+                                                                e,
+                                                                src_index,
+                                                                dst_index,
+                                                                compute_type,
+                                                                pool_type,
+                                                                out_size,
+                                                                out,
+                                                                dst_count);
+  } else if (index_type == phi::DataType::INT64) {
+    GraphSendERecvOpCUDAKernelLaunchHelper<Context, T, int64_t>(ctx,
+                                                                x,
+                                                                e,
+                                                                src_index,
+                                                                dst_index,
+                                                                compute_type,
+                                                                pool_type,
+                                                                out_size,
+                                                                out,
+                                                                dst_count);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(graph_send_e_recv,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::GraphSendERecvKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t) {}
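Note the three-phase MIN/MAX strategy in the kernel above: the output is first filled with numeric_limits sentinels, every edge then folds its value in with an atomic min/max, and InputResetMin/MaxCUDAKernel finally rewrite rows no edge touched. A serial sketch of the same phases for MAX (values invented; the GPU version does this per output element with atomics):

  #include <algorithm>
  #include <cstdio>
  #include <limits>
  #include <vector>

  int main() {
    std::vector<float> vals = {5.f, 7.f, 3.f};  // per-edge computed values
    std::vector<int> dst = {0, 0, 2};           // destination rows
    // Phase 1: sentinel init (the kernel uses numeric_limits<T>::min()).
    std::vector<float> out(4, std::numeric_limits<float>::min());
    // Phase 2: reduce every edge into its destination row.
    for (size_t i = 0; i < dst.size(); ++i)
      out[dst[i]] = std::max(out[dst[i]], vals[i]);
    // Phase 3: reset rows that still hold the sentinel.
    for (float& v : out)
      if (v == std::numeric_limits<float>::min()) v = 0.f;
    for (float v : out) std::printf("%.0f ", v);  // prints: 7 0 3 0
    return 0;
  }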
diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h
new file mode 100644
index 0000000000000..1c8946ceebc4b
--- /dev/null
+++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <vector>
+
+namespace phi {
+
+struct BroadCastInfo {
+  bool use_bcast;
+  // x_offset[i] indicates the start position in tensor x required to
+  // compute the i-th output element; likewise for e_offset[i].
+  std::vector<int64_t> x_offset, e_offset;
+  int64_t x_len, e_len, out_len, reduce_size;
+};
+
+inline bool UseBroadCast(const phi::DDim& x_dims, const phi::DDim& e_dims) {
+  if (x_dims.size() != e_dims.size()) {
+    return true;
+  }
+  for (int i = 0; i < x_dims.size(); i++) {
+    if (x_dims[i] != e_dims[i]) {
+      return true;
+    }
+  }
+  return false;
+}
+
+inline BroadCastInfo CaclBCastInfo(const phi::DDim& x_dims,
+                                   const phi::DDim& e_dims) {
+  BroadCastInfo binfo;
+  binfo.use_bcast = UseBroadCast(x_dims, e_dims);
+  binfo.x_len = 1;
+  binfo.e_len = 1;
+  for (int i = 1; i < x_dims.size(); i++) {
+    binfo.x_len *= x_dims[i];
+  }
+  for (int i = 1; i < e_dims.size(); i++) {
+    binfo.e_len *= e_dims[i];
+  }
+  // TODO(daisiming): Whether to add dot.
+  binfo.reduce_size = 1;
+  if (binfo.use_bcast) {
+    const int max_dim = std::max(x_dims.size(), e_dims.size()) - 1;
+    int stride_x = 1, stride_e = 1;
+    binfo.x_offset.emplace_back(0);
+    binfo.e_offset.emplace_back(0);
+    int out_len = 1;
+    for (int i = 0; i < max_dim; i++) {
+      // Iterate the axis from back to front.
+      const int dl =
+          (x_dims.size() - 1 - i < 1) ? 1 : x_dims[x_dims.size() - 1 - i];
+      const int dr =
+          (e_dims.size() - 1 - i < 1) ? 1 : e_dims[e_dims.size() - 1 - i];
+      for (int j = 1; j < std::max(dl, dr); j++) {
+        for (int k = 0; k < out_len; k++) {
+          binfo.x_offset.emplace_back(binfo.x_offset[k] +
+                                      j * (j < dl) * stride_x);
+          binfo.e_offset.emplace_back(binfo.e_offset[k] +
+                                      j * (j < dr) * stride_e);
+        }
+      }
+      out_len *= std::max(dl, dr);
+      stride_x *= dl;
+      stride_e *= dr;
+    }
+    binfo.out_len = out_len;
+  } else {
+    binfo.out_len = binfo.x_len;
+  }
+  return binfo;
+}
+
+}  // namespace phi
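To see what CaclBCastInfo produces, take the hypothetical trailing shapes x = [2, 1] and e = [2, 2] (dim 0 excluded): out_len is 4, x_offset becomes {0, 0, 1, 1}, and e_offset becomes {0, 1, 2, 3}, i.e. the i-th output element reads x_off[x_offset[i]] and e_off[e_offset[i]]. A standalone re-run of the offset loop above:

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  int main() {
    std::vector<int> x_dims = {2, 1}, e_dims = {2, 2};  // tails only
    std::vector<long long> x_off = {0}, e_off = {0};
    int out_len = 1, stride_x = 1, stride_e = 1;
    for (int i = (int)x_dims.size() - 1; i >= 0; --i) {  // back to front
      int dl = x_dims[i], dr = e_dims[i];
      for (int j = 1; j < std::max(dl, dr); ++j)
        for (int k = 0; k < out_len; ++k) {
          x_off.push_back(x_off[k] + j * (j < dl) * stride_x);
          e_off.push_back(e_off[k] + j * (j < dr) * stride_e);
        }
      out_len *= std::max(dl, dr);
      stride_x *= dl;
      stride_e *= dr;
    }
    for (long long v : x_off) std::printf("%lld ", v);  // 0 0 1 1
    std::printf("/ ");
    for (long long v : e_off) std::printf("%lld ", v);  // 0 1 2 3
    return 0;
  }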
.SetDefault("ADD") - .InEnum({"ADD", "SUB", "MUL", "DIV"}); + .InEnum({"ADD", "MUL"}); AddAttr("pool_type", "(string, default 'SUM')" "Define different pool types to receive the result " diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu index ee6180ff53674..22d3c6546cbb2 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu @@ -128,27 +128,6 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, bcast_info.use_bcast, add_funtor, sum_functor); - } else if (compute_type == "SUB") { - SubtractFunctor sub_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvSumCUDAFunctor, - SubtractFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - x_bcastoff_data, - e_bcastoff_data, - out_data, - index_size, - bcast_info.x_len, - bcast_info.e_len, - out_len, - bcast_info.use_bcast, - sub_functor, - sum_functor); } else if (compute_type == "MUL") { MultiplyFunctor mul_functor; GraphSendERecvCUDAKernel< @@ -170,27 +149,6 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, bcast_info.use_bcast, mul_functor, sum_functor); - } else if (compute_type == "DIV") { - DivideFunctor div_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvSumCUDAFunctor, - DivideFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - x_bcastoff_data, - e_bcastoff_data, - out_data, - index_size, - bcast_info.x_len, - bcast_info.e_len, - out_len, - bcast_info.use_bcast, - div_functor, - sum_functor); } if (pool_type == "MEAN") { ctx.template Alloc(dst_count); @@ -236,27 +194,6 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, bcast_info.use_bcast, add_funtor, max_functor); - } else if (compute_type == "SUB") { - SubtractFunctor sub_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvMaxCUDAFunctor, - SubtractFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - x_bcastoff_data, - e_bcastoff_data, - out_data, - index_size, - bcast_info.x_len, - bcast_info.e_len, - out_len, - bcast_info.use_bcast, - sub_functor, - max_functor); } else if (compute_type == "MUL") { MultiplyFunctor mul_functor; GraphSendERecvCUDAKernel< @@ -278,27 +215,6 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, bcast_info.use_bcast, mul_functor, max_functor); - } else if (compute_type == "DIV") { - DivideFunctor div_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvMaxCUDAFunctor, - DivideFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - x_bcastoff_data, - e_bcastoff_data, - out_data, - index_size, - bcast_info.x_len, - bcast_info.e_len, - out_len, - bcast_info.use_bcast, - div_functor, - max_functor); } if (out_size > 0) { input_size = out_size; @@ -329,27 +245,6 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, bcast_info.use_bcast, add_funtor, min_functor); - } else if (compute_type == "SUB") { - SubtractFunctor sub_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvMinCUDAFunctor, - SubtractFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - x_bcastoff_data, - e_bcastoff_data, - out_data, - index_size, - bcast_info.x_len, - bcast_info.e_len, - out_len, - bcast_info.use_bcast, - sub_functor, - min_functor); } else if (compute_type == "MUL") { MultiplyFunctor mul_functor; GraphSendERecvCUDAKernel< @@ -371,27 +266,6 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, bcast_info.use_bcast, mul_functor, min_functor); - } else if 
(compute_type == "DIV") { - DivideFunctor div_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvMinCUDAFunctor, - DivideFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - x_bcastoff_data, - e_bcastoff_data, - out_data, - index_size, - bcast_info.x_len, - bcast_info.e_len, - out_len, - bcast_info.use_bcast, - div_functor, - min_functor); } if (out_size > 0) { input_size = out_size; From c14114bf6ecee1dc4135aa3b83dae96a95f89f3d Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 25 May 2022 05:17:25 +0000 Subject: [PATCH 07/51] add x_grad --- .../fluid/operators/graph_send_e_recv_op.cc | 2 +- .../phi/kernels/gpu/graph_send_e_recv_funcs.h | 169 +++++++--- .../gpu/graph_send_e_recv_grad_kernel.cu | 293 ++++++++++++++++++ .../kernels/gpu/graph_send_e_recv_kernel.cu | 63 ++-- .../impl/graph_send_e_recv_kernel_impl.h | 64 ++-- 5 files changed, 484 insertions(+), 107 deletions(-) diff --git a/paddle/fluid/operators/graph_send_e_recv_op.cc b/paddle/fluid/operators/graph_send_e_recv_op.cc index 6cdb6ca407266..262c1b5994b50 100644 --- a/paddle/fluid/operators/graph_send_e_recv_op.cc +++ b/paddle/fluid/operators/graph_send_e_recv_op.cc @@ -41,7 +41,7 @@ class GraphSendERecvGradOP : public framework::OperatorWithKernel { auto in_dims = ctx->GetInputDim("X"); ctx->SetOutputDim(framework::GradVarName("X"), in_dims); auto e_dims = ctx->GetInputDim("E"); - ctx->SetOutputDim(framework::GradVarName("E"), in_dims); + ctx->SetOutputDim(framework::GradVarName("E"), e_dims); } protected: diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h index 305adbda0a26e..31921e31e72c1 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h @@ -28,6 +28,21 @@ namespace phi { #define CUDA_MAX_NUM_THREADS 1024 +void CopyBCastOff(const BroadCastInfo& bcast_info, + thrust::device_vector& l_bcastoff, + thrust::device_vector& r_bcastoff) { + l_bcastoff.resize(bcast_info.out_len); + r_bcastoff.resize(bcast_info.out_len); + cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), + bcast_info.l_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + cudaMemcpyHostToDevice); + cudaMemcpy(thrust::raw_pointer_cast(r_bcastoff.data()), + bcast_info.r_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + cudaMemcpyHostToDevice); +} + inline int FindNumThreads(int dim, int max_num_threads = CUDA_MAX_NUM_THREADS) { PADDLE_ENFORCE_GE(dim, 0, @@ -95,7 +110,7 @@ __global__ void GraphSendERecvCUDAKernel(const T* x_data, while (tx < out_len) { int64_t x_add = use_bcast ? xbcast_off[tx] : tx; int64_t e_add = use_bcast ? 
       int64_t e_add = use_bcast ? ebcast_off[tx] : tx;
-      T val = cfunctor(x_off + x_add, e_off + e_add);
+      T val = cfunctor(x_off[x_add], e_off[e_add]);
       rfunctor(out_off + tx, val);
       tx += stride_x;
     }
@@ -103,47 +118,127 @@ __global__ void GraphSendERecvCUDAKernel(const T* x_data,
   }
 }
 
-// For backward mean
+// x_grad: for backward mean
 template <typename T, typename IndexT>
-__global__ void ManipulateMeanGradCUDAKernel(const T* params,
-                                             const IndexT* src_indices,
-                                             const IndexT* dst_indices,
-                                             T* output,
-                                             size_t index_size,
-                                             size_t slice_size,
-                                             const int32_t* dst_count) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
-    int64_t indices_i = i / slice_size;
-    int64_t slice_i = i - indices_i * slice_size;
-    IndexT src_i = src_indices[indices_i];
-    IndexT dst_i = dst_indices[indices_i];
-    int64_t in_i = src_i * slice_size + slice_i;
-    int64_t out_i = dst_i * slice_size + slice_i;
-    paddle::platform::CudaAtomicAdd(output + out_i,
-                                    *(params + in_i) / dst_count[src_i]);
-  }
-}
+__global__ void ManipulateMeanGradCUDAKernelV2(const T* x_data,
+                                               const T* e_data,
+                                               const IndexT* src_indices,
+                                               const IndexT* dst_indices,
+                                               const int* dst_count,
+                                               const int64_t* xbcast_off,
+                                               const int64_t* ebcast_off,
+                                               T* x_grad,
+                                               int64_t index_size,
+                                               int64_t x_len,
+                                               int64_t e_len,
+                                               int64_t out_len,
+                                               bool use_bcast) {
+  IndexT ty = blockIdx.y * blockDim.y + threadIdx.y;
+  const IndexT stride_y = blockDim.y * gridDim.y;
+
+  while (ty < index_size) {
+    IndexT src = src_indices[ty];
+    IndexT dst = dst_indices[ty];
+    int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t stride_x = blockDim.x * gridDim.x;
+
+    const T* x_off = x_data + src * x_len;
+    const T* e_off = e_data + ty * e_len;
+    T* x_grad_off = x_grad + dst * out_len;
+    while (tx < out_len) {
+      int64_t x_add = use_bcast ? xbcast_off[tx] : tx;
+      int64_t e_add = use_bcast ? ebcast_off[tx] : tx;
+      T val = x_off[x_add] * e_off[e_add];
+      paddle::platform::CudaAtomicAdd(x_grad_off + tx, val / dst_count[src]);
+      tx += stride_x;
+    }
+    ty += stride_y;
+  }
+}
 
-// For backward min and max
+// x_grad: backward min and max for add.
 template <typename T, typename IndexT>
-__global__ void ManipulateMinMaxGradCUDAKernel(const T* params,
-                                               const IndexT* src_indices,
-                                               const IndexT* dst_indices,
-                                               T* output,
-                                               size_t index_size,
-                                               size_t slice_size,
-                                               const T* ptr_input,
-                                               const T* ptr_output) {
-  CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) {
-    int64_t indices_i = i / slice_size;
-    int64_t slice_i = i - indices_i * slice_size;
-    IndexT src_i = src_indices[indices_i];
-    IndexT dst_i = dst_indices[indices_i];
-    int64_t in_i = src_i * slice_size + slice_i;
-    int64_t out_i = dst_i * slice_size + slice_i;
-    paddle::platform::CudaAtomicAdd(
-        output + out_i,
-        *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i)));
-  }
-}
+__global__ void ManipulateMinMaxGradCUDAKernelForAdd(const T* x_data,
+                                                     const T* e_data,
+                                                     const T* out,
+                                                     const T* out_grad,
+                                                     const IndexT* src_indices,
+                                                     const IndexT* dst_indices,
+                                                     const int64_t* xbcast_off,
+                                                     const int64_t* ebcast_off,
+                                                     T* x_grad,
+                                                     int64_t index_size,
+                                                     int64_t x_len,
+                                                     int64_t e_len,
+                                                     int64_t out_len,
+                                                     bool use_bcast) {
+  IndexT ty = blockIdx.y * blockDim.y + threadIdx.y;
+  const IndexT stride_y = blockDim.y * gridDim.y;
+
+  while (ty < index_size) {
+    IndexT src = src_indices[ty];
+    IndexT dst = dst_indices[ty];
+    int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t stride_x = blockDim.x * gridDim.x;
+
+    const T* x_off = x_data + dst * x_len;
+    const T* e_off = e_data + ty * e_len;
+    const T* out_off = out + src * out_len;
+    const T* out_grad_off = out_grad + src * out_len;
+    T* x_grad_off = x_grad + dst * x_len;
+    while (tx < out_len) {
+      int64_t x_add = use_bcast ? xbcast_off[tx] : tx;
+      int64_t e_add = use_bcast ? ebcast_off[tx] : tx;
+      T val = x_off[x_add] + e_off[e_add];
+      paddle::platform::CudaAtomicAdd(x_grad_off + x_add,
+                                      out_grad_off[tx] * (val == out_off[tx]));
+      tx += stride_x;
+    }
+    ty += stride_y;
+  }
+}
+
+// x_grad: backward min and max for mul.
+// These max/min handling functions can later also be used for E's backward
+// gradient.
+template <typename T, typename IndexT>
+__global__ void ManipulateMinMaxGradCUDAKernelForMul(const T* x_data,
+                                                     const T* e_data,
+                                                     const T* out,
+                                                     const T* out_grad,
+                                                     const IndexT* src_indices,
+                                                     const IndexT* dst_indices,
+                                                     const int64_t* xbcast_off,
+                                                     const int64_t* ebcast_off,
+                                                     T* x_grad,
+                                                     int64_t index_size,
+                                                     int64_t x_len,
+                                                     int64_t e_len,
+                                                     int64_t out_len,
+                                                     bool use_bcast) {
+  IndexT ty = blockIdx.y * blockDim.y + threadIdx.y;
+  const IndexT stride_y = blockDim.y * gridDim.y;
+
+  while (ty < index_size) {
+    IndexT src = src_indices[ty];
+    IndexT dst = dst_indices[ty];
+    int64_t tx = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t stride_x = blockDim.x * gridDim.x;
+
+    const T* x_off = x_data + dst * x_len;
+    const T* e_off = e_data + ty * e_len;
+    const T* out_off = out + src * out_len;
+    const T* out_grad_off = out_grad + src * out_len;
+    T* x_grad_off = x_grad + dst * x_len;
+    while (tx < out_len) {
+      int64_t x_add = use_bcast ? xbcast_off[tx] : tx;
+      int64_t e_add = use_bcast ? ebcast_off[tx] : tx;
+      T val = x_off[x_add] * e_off[e_add];
+      paddle::platform::CudaAtomicAdd(
+          x_grad_off + x_add,
+          out_grad_off[tx] * (val == out_off[tx]) * e_off[e_add]);
+      tx += stride_x;
+    }
+    ty += stride_y;
+  }
+}
 
 }  // namespace phi
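The multiplicative mask in the two kernels above is the usual subgradient of min/max pooling: an edge only receives gradient where its candidate value is the one that actually reached the output, and for compute_type MUL the chain rule additionally multiplies by the edge weight. In scalar form, a sketch with invented values (not the kernel itself):

  // dx = dout * 1[x*e == out] * e  for MAX/MIN pooling with MUL compute.
  #include <cstdio>

  int main() {
    float x = 3.f, e = 2.f;  // one candidate edge
    float out = 6.f;         // the max selected at the destination row
    float dout = 1.f;        // upstream gradient for that row
    float mask = (x * e == out) ? 1.f : 0.f;  // did this edge win?
    float dx = dout * mask * e;
    std::printf("dx = %.0f\n", dx);  // prints: dx = 2
    return 0;
  }

The exact floating-point equality the kernels rely on is safe here because the winning value was produced by the identical computation in the forward pass.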
diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
index 0544a1e298b8e..7c6f51f0a976a 100644
--- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
@@ -11,3 +11,296 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
+#include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h"
+#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h"
+#include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h"
+#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename Context, typename T, typename IndexT>
+void CalculateXGrad(const Context& ctx,
+                    const T* out_grad,
+                    const T* x_data,
+                    const T* e_data,
+                    const phi::DDim& out_grad_dims,
+                    const phi::DDim& x_dims,
+                    const phi::DDim& e_dims,
+                    const IndexT* s_index,
+                    const IndexT* d_index,
+                    const std::string& compute_type,
+                    const std::string& pool_type,
+                    int64_t index_size,
+                    int64_t slice_size,
+                    T* x_grad,
+                    const DenseTensor* dst_count = nullptr,
+                    const DenseTensor* out = nullptr) {
+#ifdef PADDLE_WITH_HIP
+  int block = 256;
+#else
+  int block = 1024;
+#endif
+  int64_t n = slice_size * index_size;
+  int max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0];
+  int64_t grid_tmp = (n + block - 1) / block;
+  int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx;
+  if (pool_type == "SUM") {
+    if (compute_type == "ADD") {
+      GraphSendRecvSumCUDAFunctor<T, IndexT> functor;
+      GraphSendRecvCUDAKernel<T,
+                              IndexT,
+                              GraphSendRecvSumCUDAFunctor<T, IndexT>>
+          <<<grid, block, 0, ctx.stream()>>>(
+              out_grad, d_index, s_index, x_grad, index_size, slice_size,
+              functor);
+    } else if (compute_type == "MUL") {
+      const auto& bcast_info = CalcBCastInfo(out_grad_dims, e_dims);
+      thrust::device_vector<int64_t> l_bcastoff, r_bcastoff;
+      if (bcast_info.use_bcast) {
+        CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff);
+      }
+      int64_t out_len = bcast_info.out_len;
+      const int ntx = FindNumThreads(out_len);
+      const int nty = CUDA_MAX_NUM_THREADS / ntx;
+      const int nbx = (out_len + ntx - 1) / ntx;
+      const int nby = (index_size + nty - 1) / nty;
+      const dim3 grid_(nbx, nby);
+      const dim3 block_(ntx, nty);
+      GraphSendERecvSumCUDAFunctor<T> sum_functor;
+      MultiplyFunctor<T> mul_functor;
+      GraphSendERecvCUDAKernel<
+          T,
+          IndexT,
+          GraphSendERecvSumCUDAFunctor<T>,
+          MultiplyFunctor<T>><<<grid_, block_, 0, ctx.stream()>>>(
+          out_grad,
+          e_data,
+          d_index,
+          s_index,
+          thrust::raw_pointer_cast(l_bcastoff.data()),
+          thrust::raw_pointer_cast(r_bcastoff.data()),
+          x_grad,
+          index_size,
+          bcast_info.l_len,
+          bcast_info.r_len,
+          out_len,
+          bcast_info.use_bcast,
+          mul_functor,
+          sum_functor);
+    }
+  } else if (pool_type == "MEAN") {
+    const int* s_count = dst_count->data<int>();
+    if (compute_type == "ADD") {
+      ManipulateMeanGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
+          out_grad, d_index, s_index, x_grad, index_size, slice_size, s_count);
+    } else if (compute_type == "MUL") {
+      const auto& bcast_info = CalcBCastInfo(out_grad_dims, e_dims);
+      thrust::device_vector<int64_t> l_bcastoff, r_bcastoff;
+      if (bcast_info.use_bcast) {
+        CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff);
+      }
+      int64_t out_len = bcast_info.out_len;
+      const int ntx = FindNumThreads(out_len);
+      const int nty = CUDA_MAX_NUM_THREADS / ntx;
+      const int nbx = (out_len + ntx - 1) / ntx;
+      const int nby = (index_size + nty - 1) / nty;
+      const dim3 grid_(nbx, nby);
+      const dim3 block_(ntx, nty);
+      ManipulateMeanGradCUDAKernelV2<
+          T,
+          IndexT><<<grid_, block_, 0, ctx.stream()>>>(
+          out_grad,
+          e_data,
+          d_index,
+          s_index,
+          s_count,
+          thrust::raw_pointer_cast(l_bcastoff.data()),
+          thrust::raw_pointer_cast(r_bcastoff.data()),
+          x_grad,
+          index_size,
+          bcast_info.l_len,
+          bcast_info.r_len,
+          out_len,
+          bcast_info.use_bcast);
+    }
+  } else if (pool_type == "MAX" || pool_type == "MIN") {
+    const T* out_data = out->data<T>();
+    const auto& bcast_info =
        CalcBCastInfo(x_dims, e_dims);
+    thrust::device_vector<int64_t> l_bcastoff, r_bcastoff;
+    if (bcast_info.use_bcast) {
+      CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff);
+    }
+    int64_t out_len = bcast_info.out_len;
+    const int ntx = FindNumThreads(out_len);
+    const int nty = CUDA_MAX_NUM_THREADS / ntx;
+    const int nbx = (out_len + ntx - 1) / ntx;
+    const int nby = (index_size + nty - 1) / nty;
+    const dim3 grid_(nbx, nby);
+    const dim3 block_(ntx, nty);
+    if (compute_type == "ADD") {
+      ManipulateMinMaxGradCUDAKernelForAdd<
+          T,
+          IndexT><<<grid_, block_, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          out_data,
+          out_grad,
+          d_index,
+          s_index,
+          thrust::raw_pointer_cast(l_bcastoff.data()),
+          thrust::raw_pointer_cast(r_bcastoff.data()),
+          x_grad,
+          index_size,
+          bcast_info.l_len,
+          bcast_info.r_len,
+          out_len,
+          bcast_info.use_bcast);
+    } else if (compute_type == "MUL") {
+      ManipulateMinMaxGradCUDAKernelForMul<
+          T,
+          IndexT><<<grid_, block_, 0, ctx.stream()>>>(
+          x_data,
+          e_data,
+          out_data,
+          out_grad,
+          d_index,
+          s_index,
+          thrust::raw_pointer_cast(l_bcastoff.data()),
+          thrust::raw_pointer_cast(r_bcastoff.data()),
+          x_grad,
+          index_size,
+          bcast_info.l_len,
+          bcast_info.r_len,
+          out_len,
+          bcast_info.use_bcast);
+    }
+  }
+}
+
+template <typename Context, typename T, typename IndexT>
+void GraphSendERecvGradOpCUDAKernelLaunchHelper(
+    const Context& ctx,
+    const DenseTensor& out_grad,
+    const DenseTensor& x,
+    const DenseTensor& e,
+    const DenseTensor& src_index,
+    const DenseTensor& dst_index,
+    const std::string& compute_type,
+    const std::string& pool_type,
+    DenseTensor* x_grad,
+    DenseTensor* e_grad,
+    const DenseTensor* dst_count = nullptr,
+    const DenseTensor* out = nullptr) {
+  const int& index_size = dst_index.dims()[0];
+
+  ctx.template Alloc<T>(x_grad);
+  T* x_grad_data = x_grad->data<T>();
+  ctx.template Alloc<T>(e_grad);
+  T* e_grad_data = e_grad->data<T>();
+  const auto& x_dims = x.dims();
+  const auto& e_dims = e.dims();
+  int64_t memset_size_x = 1, memset_size_e = 1;
+  int64_t slice_size = 1;
+  for (int i = 0; i < x_dims.size(); i++) {
+    memset_size_x *= x_dims[i];
+    if (i > 0) slice_size *= x_dims[i];
+  }
+  for (int i = 0; i < e_dims.size(); i++) {
+    memset_size_e *= e_dims[i];
+  }
+  const size_t& memset_bytes_x = memset_size_x * sizeof(T);
+  const size_t& memset_bytes_e = memset_size_e * sizeof(T);
+#ifdef PADDLE_WITH_HIP
+  hipMemset(x_grad_data, 0, memset_bytes_x);
+  hipMemset(e_grad_data, 0, memset_bytes_e);
+#else
+  cudaMemset(x_grad_data, 0, memset_bytes_x);
+  cudaMemset(e_grad_data, 0, memset_bytes_e);
+#endif
+
+  if (index_size == 0) return;
+
+  const T* out_grad_data = out_grad.data<T>();
+  const T* x_data = x.data<T>();
+  const T* e_data = e.data<T>();
+  const IndexT* s_index = src_index.data<IndexT>();
+  const IndexT* d_index = dst_index.data<IndexT>();
+
+  // Calculate x_grad
+  CalculateXGrad<Context, T, IndexT>(ctx,
+                                     out_grad_data,
+                                     x_data,
+                                     e_data,
+                                     out_grad.dims(),
+                                     x_dims,
+                                     e_dims,
+                                     s_index,
+                                     d_index,
+                                     compute_type,
+                                     pool_type,
+                                     index_size,
+                                     slice_size,
+                                     x_grad_data,
+                                     dst_count,
+                                     out);
+}
+
+template <typename T, typename Context>
+void GraphSendERecvGradKernel(const Context& ctx,
+                              const DenseTensor& x,
+                              const DenseTensor& e,
+                              const DenseTensor& src_index,
+                              const DenseTensor& dst_index,
+                              paddle::optional<const DenseTensor&> out,
+                              paddle::optional<const DenseTensor&> dst_count,
+                              const DenseTensor& out_grad,
+                              const std::string& compute_type,
+                              const std::string& pool_type,
+                              DenseTensor* x_grad,
+                              DenseTensor* e_grad) {
+  auto index_type = src_index.dtype();
+  if (index_type == phi::DataType::INT32) {
+    GraphSendERecvGradOpCUDAKernelLaunchHelper<Context, T, int32_t>(
+        ctx,
+        out_grad,
+        x,
+        e,
+        src_index,
+        dst_index,
+        compute_type,
+        pool_type,
+        x_grad,
+        e_grad,
+        dst_count.get_ptr(),
out.get_ptr()); + } else if (index_type == phi::DataType::INT64) { + GraphSendERecvGradOpCUDAKernelLaunchHelper( + ctx, + out_grad, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + x_grad, + e_grad, + dst_count.get_ptr(), + out.get_ptr()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_e_recv_grad, + GPU, + ALL_LAYOUT, + phi::GraphSendERecvGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu index 22d3c6546cbb2..83e7931a0bddc 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu @@ -71,7 +71,7 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, if (index_size == 0) return; - const auto& bcast_info = CaclBCastInfo(x.dims(), e.dims()); + const auto& bcast_info = CalcBCastInfo(x.dims(), e.dims()); const T* x_data = x.data(); const T* e_data = e.data(); const IndexT* s_index = src_index.data(); @@ -79,16 +79,7 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, thrust::device_vector x_bcastoff, e_bcastoff; if (bcast_info.use_bcast) { - x_bcastoff.resize(bcast_info.out_len); - e_bcastoff.resize(bcast_info.out_len); - cudaMemcpy(thrust::raw_pointer_cast(x_bcastoff.data()), - bcast_info.x_offset.data(), - sizeof(int64_t) * bcast_info.out_len, - cudaMemcpyHostToDevice); - cudaMemcpy(thrust::raw_pointer_cast(e_bcastoff.data()), - bcast_info.e_offset.data(), - sizeof(int64_t) * bcast_info.out_len, - cudaMemcpyHostToDevice); + CopyBCastOff(bcast_info, x_bcastoff, e_bcastoff); } int64_t out_len = bcast_info.out_len; @@ -98,8 +89,6 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, const int nby = (index_size + nty - 1) / nty; const dim3 grid(nbx, nby); const dim3 block(ntx, nty); - int64_t* x_bcastoff_data = thrust::raw_pointer_cast(x_bcastoff.data()); - int64_t* e_bcastoff_data = trhust::raw_pointer_cast(e_bcastoff.data()); int64_t input_size = x.dims()[0]; #ifdef PADDLE_WITH_HIP int block_ = 256; @@ -118,12 +107,12 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, e_data, s_index, d_index, - x_bcastoff_data, - e_bcastoff_data, + thrust::raw_pointer_cast(x_bcastoff.data()); + thrust::raw_pointer_cast(e_bcastoff.data()); out_data, index_size, - bcast_info.x_len, - bcast_info.e_len, + bcast_info.l_len, + bcast_info.r_len, out_len, bcast_info.use_bcast, add_funtor, @@ -139,12 +128,12 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, e_data, s_index, d_index, - x_bcastoff_data, - e_bcastoff_data, + thrust::raw_pointer_cast(x_bcastoff.data()); + thrust::raw_pointer_cast(e_bcastoff.data()); out_data, index_size, - bcast_info.x_len, - bcast_info.e_len, + bcast_info.l_len, + bcast_info.r_len, out_len, bcast_info.use_bcast, mul_functor, @@ -184,12 +173,12 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, e_data, s_index, d_index, - x_bcastoff_data, - e_bcastoff_data, + thrust::raw_pointer_cast(x_bcastoff.data()); + thrust::raw_pointer_cast(e_bcastoff.data()); out_data, index_size, - bcast_info.x_len, - bcast_info.e_len, + bcast_info.l_len, + bcast_info.r_len, out_len, bcast_info.use_bcast, add_funtor, @@ -205,12 +194,12 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, e_data, s_index, d_index, - x_bcastoff_data, - e_bcastoff_data, + thrust::raw_pointer_cast(x_bcastoff.data()); + thrust::raw_pointer_cast(e_bcastoff.data()); out_data, index_size, - bcast_info.x_len, - bcast_info.e_len, 
+ bcast_info.l_len, + bcast_info.r_len, out_len, bcast_info.use_bcast, mul_functor, @@ -235,12 +224,12 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, e_data, s_index, d_index, - x_bcastoff_data, - e_bcastoff_data, + thrust::raw_pointer_cast(x_bcastoff.data()); + thrust::raw_pointer_cast(e_bcastoff.data()); out_data, index_size, - bcast_info.x_len, - bcast_info.e_len, + bcast_info.l_len, + bcast_info.r_len, out_len, bcast_info.use_bcast, add_funtor, @@ -256,12 +245,12 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, e_data, s_index, d_index, - x_bcastoff_data, - e_bcastoff_data, + thrust::raw_pointer_cast(x_bcastoff.data()); + thrust::raw_pointer_cast(e_bcastoff.data()); out_data, index_size, - bcast_info.x_len, - bcast_info.e_len, + bcast_info.l_len, + bcast_info.r_len, out_len, bcast_info.use_bcast, mul_functor, diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h index 1c8946ceebc4b..322fa5bb333b8 100644 --- a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h @@ -19,64 +19,64 @@ namespace phi { struct BroadCastInfo { bool use_bcast; - // x_offset[i] indicates the start position of tensor x that required to - // compute the i-th element in output, so as e_offset[i]. - std::vector x_offset, e_offset; - int64_t x_len, e_len, out_len, reduce_size; + // l_offset[i] indicates the start position of tensor lhs that required to + // compute the i-th element in output, so as r_offset[i]. + std::vector l_offset, r_offset; + int64_t l_len, r_len, out_len, reduce_size; }; -bool UseBroadCast(const phi::DDim& x_dims, const phi::DDim& e_dims) { - if (x_dims.size() != e_dims.size()) { - return True; +bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) { + if (l_dims.size() != r_dims.size()) { + return true; } - for (int i = 0; i < x_dims.size(); i++) { - if (x_dims[i] != e_dims[i]) { - return True; + for (int i = 1; i < l_dims.size(); i++) { + if (l_dims[i] != r_dims[i]) { + return true; } } - return False; + return false; } -BroadCastInfo CaclBCastInfo(const phi::DDim& x_dims, const phi::DDim& e_dims) { +BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, const phi::DDim& r_dims) { BroadCastInfo binfo; - binfo.use_bcast = UseBroadCast(x_dims, e_dims); - binfo.x_len = 1; - binfo.e_len = 1; - for (int i = 1; i < x_dims.size(); i++) { - binfo.x_len *= x_dims[i]; + binfo.use_bcast = UseBroadCast(l_dims, r_dims); + binfo.l_len = 1; + binfo.r_len = 1; + for (int i = 1; i < l_dims.size(); i++) { + binfo.l_len *= l_dims[i]; } - for (int i = 1; i < e_dims.size(); i++) { - binfo.e_len *= e_dims[i]; + for (int i = 1; i < r_dims.size(); i++) { + binfo.r_len *= r_dims[i]; } // TODO(daisiming): Whether to add dot. binfo.reduce_size = 1; if (binfo.use_bcast) { - const int max_dim = std::max(x_dims.size(), e_dims.size()) - 1; - int stride_x = 1, stride_e = 1; - binfo.x_offset.emplace_back(0); - binfo.e_offset.emplace_back(0); + const int max_dim = std::max(l_dims.size(), r_dims.size()) - 1; + int stride_l = 1, stride_r = 1; + binfo.l_offset.emplace_back(0); + binfo.r_offset.emplace_back(0); int out_len = 1; for (int i = 0; i < max_dim; i++) { // Iterate the axis from back to front. const int dl = - (x_dims.size() - 1 - i < 1) ? 1 : x_dims[x_dims.size() - 1 - i]; + (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i]; const int dr = - (e_dims.size() - 1 - i < 1) ? 
1 : e_dims[e_dims.size() - 1 - i]; + (r_dims.size() - 1 - i < 1) ? 1 : r_dims[r_dims.size() - 1 - i]; for (int j = 0; j < std::max(dl, dr); j++) { for (int k = 0; k < out_len; k++) { - binfo.x_offset.emplace_back(binfo.x_offset[k] + - j * (j < dl) * stride_x); - binfo.e_offset.emplace_back(binfo.e_offset[k] + - j * (j < dr) * stride_e); + binfo.l_offset.emplace_back(binfo.l_offset[k] + + j * (j < dl) * stride_l); + binfo.r_offset.emplace_back(binfo.r_offset[k] + + j * (j < dr) * stride_r); } } out_len *= std::max(dl, dr); - stride_x *= dl; - stride_e *= dr; + stride_l *= dl; + stride_r *= dr; } binfo.out_len = out_len; } else { - binfo.out_len = binfo.x_len; + binfo.out_len = binfo.l_len; } return binfo; } From 4bf648004cd6f987819caada5131ddce27537602 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 25 May 2022 06:23:50 +0000 Subject: [PATCH 08/51] add template --- .../gpu/graph_send_e_recv_grad_kernel.cu | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index 7c6f51f0a976a..0b250f6add9e8 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -231,22 +231,22 @@ void GraphSendERecvGradOpCUDAKernelLaunchHelper( const IndexT* d_index = dst_index.data(); // Calculate x_grad - CalculateXGrad(ctx, - out_grad_data, - x_data, - e_data, - out_grad.dims(), - x_dims, - e_dims, - s_index, - d_index, - compute_type, - pool_type, - index_size, - slice_size, - x_grad_data, - dst_count, - out); + CalculateXGrad(ctx, + out_grad_data, + x_data, + e_data, + out_grad.dims(), + x_dims, + e_dims, + s_index, + d_index, + compute_type, + pool_type, + index_size, + slice_size, + x_grad_data, + dst_count, + out); } template From 94702d45db1166fc0202cab950f231e789f64c47 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 25 May 2022 07:33:18 +0000 Subject: [PATCH 09/51] add e_grad for min and max --- .../phi/kernels/gpu/graph_send_e_recv_funcs.h | 9 + .../gpu/graph_send_e_recv_grad_kernel.cu | 177 +++++++++++------- 2 files changed, 118 insertions(+), 68 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h index 31921e31e72c1..f45362cd13704 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h @@ -167,6 +167,7 @@ __global__ void ManipulateMinMaxGradCUDAKernelForAdd(const T* x_data, const int64_t* xbcast_off, const int64_t* ebcast_off, T* x_grad, + T* e_grad, int64_t index_size, int64_t x_len, int64_t e_len, @@ -186,12 +187,15 @@ __global__ void ManipulateMinMaxGradCUDAKernelForAdd(const T* x_data, const T* out_off = out + src * out_len; const T* out_grad_off = out_grad + src * out_len; T* x_grad_off = x_grad + dst * x_len; + T* e_grad_off = e_grad + ty * e_len; while (tx < out_len) { int64_t x_add = use_bcast ? xbcast_off[tx] : tx; int64_t e_add = use_bcast ? 
ebcast_off[tx] : tx; T val = x_off[x_add] + e_off[e_add]; paddle::platform::CudaAtomicAdd(x_grad_off + x_add, out_grad_off[tx] * (val == out_off[tx])); + paddle::platform::CudaAtomicAdd(e_grad_off + e_add, + out_grad_off[tx] * (val == out_off[tx])); tx += stride_x; } ty += stride_y; @@ -210,6 +214,7 @@ __global__ void ManipulateMinMaxGradCUDAKernelForMul(const T* x_data, const int64_t* xbcast_off, const int64_t* ebcast_off, T* x_grad, + T* e_grad, int64_t index_size, int64_t x_len, int64_t e_len, @@ -229,6 +234,7 @@ __global__ void ManipulateMinMaxGradCUDAKernelForMul(const T* x_data, const T* out_off = out + src * out_len; const T* out_grad_off = out_grad + src * out_len; T* x_grad_off = x_grad + dst * x_len; + T* e_grad_off = e_grad + ty * e_len; while (tx < out_len) { int64_t x_add = use_bcast ? xbcast_off[tx] : tx; int64_t e_add = use_bcast ? xbcast_off[tx] : tx; @@ -236,6 +242,9 @@ __global__ void ManipulateMinMaxGradCUDAKernelForMul(const T* x_data, paddle::platform::CudaAtomicAdd( x_grad_off + x_add, out_grad_off[tx] * (val == out_off[tx]) * e_off[e_add]); + paddle::platform::CudaAtomicAdd( + e_grad_off + e_add, + out_grad_off[tx] * (val == out_off[tx]) * x_off[x_add]); tx += stride_x; } ty += stride_y; diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index 0b250f6add9e8..870baa8185b4f 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -23,6 +23,79 @@ namespace phi { +template +void CalculateXEGradForMinMax(const Context& ctx, + const T* out_grad, + const T* x_data, + const T* e_data, + const phi::DDim& out_grad_dims, + const phi::DDim& x_dims, + const phi::DDim& e_dims, + const IndexT* s_index, + const IndexT* d_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t index_size, + int64_t slice_size, + T* x_grad, + T* e_grad, + const DenseTensor* out = nullptr) { + const T* out_data = out->data(); + const auto& bcast_info = CalcBCastInfo(x_dims, e_dims); + thrust::device_vector l_bcastoff, r_bcastoff; + if (bcast_info.use_bcast) { + CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + } + + int64_t out_len = bcast_info.out_len; + const int ntx = FindNumThreads(out_len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (out_len + ntx - 1) / ntx; + const int nby = (index_size + nty - 1) / nty; + const dim3 grid(nbx, nby); + const dim3 block(ntx, nty); + + if (compute_type == "ADD") { + ManipulateMinMaxGradCUDAKernelForAdd< + T, + IndexT><<>>( + x_data, + e_data, + out_data, + out_grad, + d_index, + s_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad, + e_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); + } else if (compute_type == "MUL") { + ManipulateMinMaxGradCUDAKernelForMul< + T, + IndexT><<>>( + x_data, + e_data, + out_data, + out_grad, + d_index, + s_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad, + e_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); + } +} + template void CalculateXGrad(const Context& ctx, const T* out_grad, @@ -127,57 +200,6 @@ void CalculateXGrad(const Context& ctx, out_len, bcast_info.use_bcast); } - } else if (pool_type == "MAX" || pool_type == "MIN") { - const T* out_data = out->data(); - const auto& bcast_info = CalcBCastInfo(x_dims, e_dims); 
- thrust::device_vector l_bcastoff, r_bcastoff; - if (bcast_info.use_bcast) { - CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); - } - int64_t out_len = bcast_info.out_len; - const int ntx = FindNumThreads(out_len); - const int nty = CUDA_MAX_NUM_THREADS / ntx; - const int nbx = (out_len + ntx - 1) / ntx; - const int nby = (index_size + nty - 1) / nty; - const dim3 grid_(nbx, nby); - const dim3 block_(ntx, nty); - if (compute_type == "ADD") { - ManipulateMinMaxGradCUDAKernelForAdd< - T, - IndexT><<>>( - x_data, - e_data, - out_data, - out_grad, - d_index, - s_index, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - x_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); - } else if (compute_type == "MUL") { - ManipulateMinMaxGradCUDAKernelForMul< - T, - IndexT><<>>( - x_data, - e_data, - out_data, - out_grad, - d_index, - s_index, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - x_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); - } } } @@ -230,23 +252,42 @@ void GraphSendERecvGradOpCUDAKernelLaunchHelper( const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); - // Calculate x_grad - CalculateXGrad(ctx, - out_grad_data, - x_data, - e_data, - out_grad.dims(), - x_dims, - e_dims, - s_index, - d_index, - compute_type, - pool_type, - index_size, - slice_size, - x_grad_data, - dst_count, - out); + if (pool_type == "SUM" || pool_type == "MEAN") { + CalculateXGrad(ctx, + out_grad_data, + x_data, + e_data, + out_grad.dims(), + x_dims, + e_dims, + s_index, + d_index, + compute_type, + pool_type, + index_size, + slice_size, + x_grad_data, + dst_count, + out); + CalculateEGrad(); + } else if (pool_type == "MIN" || pool_type == "MAX") { + CalculateXEGradForMinMax(ctx, + out_grad_data, + x_data, + e_data, + out_grad.dims(), + x_dims, + e_dims, + s_index, + d_index, + compute_type, + pool_type, + index_size, + slice_size, + x_grad_data, + e_grad_data, + out); + } } template From 0d82c54313366d13c660e963af1b4fd8253e6c26 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 25 May 2022 08:12:01 +0000 Subject: [PATCH 10/51] fix small bug --- paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index 870baa8185b4f..b0a98fa7db754 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -77,7 +77,7 @@ void CalculateXEGradForMinMax(const Context& ctx, } else if (compute_type == "MUL") { ManipulateMinMaxGradCUDAKernelForMul< T, - IndexT><<>>( + IndexT><<>>( x_data, e_data, out_data, From 40e3fc4e4893847ce59697a701712b7bf522d394 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 25 May 2022 12:01:44 +0000 Subject: [PATCH 11/51] temp commit --- .../phi/kernels/gpu/graph_send_e_recv_funcs.h | 46 ++++++++++++------- .../gpu/graph_send_e_recv_grad_kernel.cu | 20 +++++++- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h index f45362cd13704..bb359bb00b5f6 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h @@ -118,21 +118,21 @@ __global__ 
void GraphSendERecvCUDAKernel(const T* x_data, } } -// x_grad: for backward mean +// x_grad: for backward mean with mul. template -__global__ void ManipulateMeanGradCUDAKernelV2(const T* x_data, - const T* e_data, - const IndexT* src_indices, - const IndexT* dst_indices, - const int* dst_count, - const int64_t* xbcast_off, - const int64_t* ebcast_off, - T* x_grad, - int64_t index_size, - int64_t x_len, - int64_t e_len, - int64_t out_len, - bool use_bcast) { +__global__ void ManipulateMeanGradCUDAKernelForMulX(const T* x_data, + const T* e_data, + const IndexT* src_indices, + const IndexT* dst_indices, + const int* dst_count, + const int64_t* xbcast_off, + const int64_t* ebcast_off, + T* x_grad, + int64_t index_size, + int64_t x_len, + int64_t e_len, + int64_t out_len, + bool use_bcast) { IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; const IndexT stride_y = blockDim.y * gridDim.y; @@ -156,7 +156,20 @@ __global__ void ManipulateMeanGradCUDAKernelV2(const T* x_data, } } -// x_grad: backward min and max for add. +// e_grad: backward sum for add. +template +__global__ void ManipulateSumGradCUDAKernelForAddE() {} + +// e_grad: backward sum for mul. +__global__ void ManipulateSumGradCUDAKernelForMulE() {} + +// e_grad: backward mean for add +__global__ void ManipulateMeanGradCUDAKernelForAddE() {} + +// e_grad: backward mean for mul. +__global__ void ManipulateMeanGradCUDAKernelForMulE() {} + +// x_grad, e_grad: backward min and max for add. template __global__ void ManipulateMinMaxGradCUDAKernelForAdd(const T* x_data, const T* e_data, @@ -202,8 +215,7 @@ __global__ void ManipulateMinMaxGradCUDAKernelForAdd(const T* x_data, } } -// x_grad: backward min and max for mul. -// 后续maxmin的处理函数也可以用来处理e的反向梯度 +// x_grad, e_grad: backward min and max for mul. template __global__ void ManipulateMinMaxGradCUDAKernelForMul(const T* x_data, const T* e_data, diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index b0a98fa7db754..8f2540af24ae8 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -183,7 +183,7 @@ void CalculateXGrad(const Context& ctx, const int nby = (index_size + nty - 1) / nty; const dim3 grid_(nbx, nby); const dim3 block_(ntx, nty); - ManipulateMeanGradCUDAKernelV2< + ManipulateMeanGradCUDAKernelForMulX< T, IndexT><<>>( out_grad, @@ -203,6 +203,24 @@ void CalculateXGrad(const Context& ctx, } } +template +void CalculateEGrad(const Context& ctx, + const T* out_grad, + const T* x_data, + const T* e_data, + const phi::DDim& out_grad_dims, + const phi::DDim& x_dims, + const phi::DDim& e_dims, + const IndexT* s_index, + const IndexT* d_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t index_size, + int64_t slice_size, + T* e_grad, + const DenseTensor* dst_count = nullptr, + const DenseTensor* out = nullptr) {} + template void GraphSendERecvGradOpCUDAKernelLaunchHelper( const Context& ctx, From c566dcc333bee16fcae64f0912d6d870ce853d89 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 31 May 2022 07:12:00 +0000 Subject: [PATCH 12/51] temp commit --- .../gpu/graph_send_e_recv_grad_kernel.cu | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index 8f2540af24ae8..32fc777b9db90 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu 
+++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -219,7 +219,18 @@ void CalculateEGrad(const Context& ctx, int64_t slice_size, T* e_grad, const DenseTensor* dst_count = nullptr, - const DenseTensor* out = nullptr) {} + const DenseTensor* out = nullptr) { + if (pool_type == "SUM") { + if (compute_type == "ADD") { + } else if (compute_type == "MUL") { + } + } else if (pool_type == "MEAN") { + const int* s_count = dst_count->data(); + if (compute_type == "ADD") { + } else if (compute_type == "MUL") { + } + } +} template void GraphSendERecvGradOpCUDAKernelLaunchHelper( @@ -287,7 +298,22 @@ void GraphSendERecvGradOpCUDAKernelLaunchHelper( x_grad_data, dst_count, out); - CalculateEGrad(); + CalculateEGrad(ctx, + out_grad_data, + x_data, + e_data, + out_grad.dims(), + x_dims, + e_dims, + s_index, + d_index, + compute_type, + pool_type, + index_size, + slice_size, + e_grad_data, + dst_count, + out); } else if (pool_type == "MIN" || pool_type == "MAX") { CalculateXEGradForMinMax(ctx, out_grad_data, From de96e57ae22ae8ec49d3338f26c5535b86c9d0f7 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 1 Jun 2022 08:45:49 +0000 Subject: [PATCH 13/51] add e_grad for sum and mean --- .../phi/kernels/gpu/graph_send_e_recv_funcs.h | 149 ++++++++++++++++-- .../gpu/graph_send_e_recv_grad_kernel.cu | 67 +++++++- .../kernels/gpu/graph_send_e_recv_kernel.cu | 2 +- .../impl/graph_send_e_recv_kernel_impl.h | 2 +- 4 files changed, 203 insertions(+), 17 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h index bb359bb00b5f6..2a95f7cc3a91a 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h @@ -120,17 +120,17 @@ __global__ void GraphSendERecvCUDAKernel(const T* x_data, // x_grad: for backward mean with mul. template -__global__ void ManipulateMeanGradCUDAKernelForMulX(const T* x_data, +__global__ void ManipulateMeanGradCUDAKernelForMulX(const T* out_grad_data, const T* e_data, const IndexT* src_indices, const IndexT* dst_indices, const int* dst_count, - const int64_t* xbcast_off, - const int64_t* ebcast_off, + const int64_t* l_bcastoff, + const int64_t* r_bcastoff, T* x_grad, int64_t index_size, - int64_t x_len, - int64_t e_len, + int64_t l_len, + int64_t r_len, int64_t out_len, bool use_bcast) { IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; @@ -142,13 +142,13 @@ __global__ void ManipulateMeanGradCUDAKernelForMulX(const T* x_data, int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; int64_t stride_x = blockDim.x * gridDim.x; - const T* x_off = x_data + src * x_len; - const T* e_off = e_data + ty * e_len; + const T* out_grad_off = out_grad_data + src * l_len; + const T* e_off = e_data + ty * r_len; T* x_grad_off = x_grad + dst * out_len; while (tx < out_len) { - int64_t x_add = use_bcast ? xbcast_off[tx] : tx; - int64_t e_add = use_bcast ? ebcast_off[tx] : tx; - T val = x_off[x_add] * e_off[e_add]; + int64_t o_add = use_bcast ? l_bcastoff[tx] : tx; + int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; + T val = out_grad_off[o_add] * e_off[e_add]; paddle::platform::CudaAtomicAdd(x_grad_off + tx, val / dst_count[src]); tx += stride_x; } @@ -158,16 +158,137 @@ __global__ void ManipulateMeanGradCUDAKernelForMulX(const T* x_data, // e_grad: backward sum for add. 
template -__global__ void ManipulateSumGradCUDAKernelForAddE() {} +__global__ void ManipulateSumGradCUDAKernelForAddE(const T* out_grad_data, + const IndexT* dst_indices, + const int64_t* r_bcastoff, + T* e_grad, + int64_t index_size, + int64_t r_len, + int64_t out_len, + bool use_bcast) { + IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; + const IndexT stride_y = blockDim.y * gridDim.y; + + while (ty < index_size) { + IndexT dst = dst_indices[ty]; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t stride_x = blockDim.x * gridDim.x; + + T* e_grad_off = e_grad + ty * r_len; + const T* out_grad_off = out_grad_data + dst * out_len; + while (tx < out_len) { + int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; + paddle::platform::CudaAtomicAdd(e_grad_off + e_add, out_grad_off[tx]); + tx += stride_x; + } + ty += stride_y; + } +} // e_grad: backward sum for mul. -__global__ void ManipulateSumGradCUDAKernelForMulE() {} +template +__global__ void ManipulateSumGradCUDAKernelForMulE(const T* x_data, + const T* out_grad_data, + const IndexT* src_indices, + const IndexT* dst_indices, + const int64_t* l_bcastoff, + const int64_t* r_bcastoff, + T* e_grad, + int64_t index_size, + int64_t l_len, + int64_t r_len, + int64_t out_len, + bool use_bcast) { + IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; + const IndexT stride_y = blockDim.y * gridDim.y; + + while (ty < index_size) { + IndexT src = src_indices[ty]; + IndexT dst = dst_indices[ty]; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t stride_x = blockDim.x * gridDim.x; + + const T* x_off = x_data + src * l_len; + T* e_grad_off = e_grad + ty * r_len; + const T* out_grad_off = out_grad_data + dst * out_len; + while (tx < out_len) { + int64_t x_add = use_bcast ? l_bcastoff[tx] : tx; + int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; + paddle::platform::CudaAtomicAdd(e_grad_off + e_add, + out_grad_off[tx] * x_off[x_add]); + tx += stride_x; + } + ty += stride_y; + } +} // e_grad: backward mean for add -__global__ void ManipulateMeanGradCUDAKernelForAddE() {} +template +__global__ void ManipulateMeanGradCUDAKernelForAddE(const T* out_grad_data, + const IndexT* dst_indices, + const int* dst_count, + const int64_t* r_bcastoff, + T* e_grad, + int64_t index_size, + int64_t r_len, + int64_t out_len, + bool use_bcast) { + IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; + const IndexT stride_y = blockDim.y * gridDim.y; + + while (ty < index_size) { + IndexT dst = dst_indices[ty]; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t stride_x = blockDim.x * gridDim.x; + + T* e_grad_off = e_grad + ty * r_len; + const T* out_grad_off = out_grad_data + dst * out_len; + while (tx < out_len) { + int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; + paddle::platform::CudaAtomicAdd(e_grad_off + e_add, + out_grad_off[tx] / dst_count[dst]); + tx += stride_x; + } + ty += stride_y; + } +} // e_grad: backward mean for mul. 
-__global__ void ManipulateMeanGradCUDAKernelForMulE() {} +__global__ void ManipulateMeanGradCUDAKernelForMulE(const T* x_data, + const T* out_grad_data, + const IndexT* src_indices, + const IndexT* dst_indices, + const int* dst_count, + const int64_t* l_bcastoff, + const int64_t* r_bcastoff, + T* e_grad, + int64_t index_size, + int64_t l_len, + int64_t r_len, + int64_t out_len, + bool use_bcast) { + IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; + const IndexT stride_y = blockDim.y * gridDim.y; + + while (ty < index_size) { + IndexT src = src_indices[ty]; + IndexT dst = dst_indices[ty]; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t stride_x = blockDim.x * gridDim.x; + + const T* x_off = x_data + src * l_len; + T* e_grad_off = e_grad + ty * r_len; + const T* out_grad_off = out_grad_data + dst * out_len; + while (tx < out_len) { + int64_t x_add = use_bcast ? l_bcastoff[tx] : tx; + int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; + paddle::platform::CudaAtomicAdd( + e_grad_off + e_add, out_grad_off[tx] * x_off[x_add] / dst_count[dst]); + tx += stride_x; + } + ty += stride_y; + } +} // x_grad, e_grad: backward min and max for add. template diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index 32fc777b9db90..b54c3c6000443 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -216,18 +216,83 @@ void CalculateEGrad(const Context& ctx, const std::string& compute_type, const std::string& pool_type, int64_t index_size, - int64_t slice_size, T* e_grad, const DenseTensor* dst_count = nullptr, const DenseTensor* out = nullptr) { + const auto& bcast_info = CalcBCastInfo(x_dims, e_dims); + thrust::device_vector l_bcastoff, r_bcastoff; + if (bcast_info.use_bcast) { + CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + } + int64_t out_len = bcast_info.out_len; + const int ntx = FindNumThreads(out_len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (out_len + ntx - 1) / ntx; + const int nby = (index_size + nty - 1) / nty; + const dim3 grid(nbx, nby); + const dim3 block_(ntx, nty); if (pool_type == "SUM") { if (compute_type == "ADD") { + ManipulateSumGradCUDAKernelForAddE< + T, + IndexT><<>>( + out_grad, + d_index, + thrust::raw_pointer_cast(r_bcastoff.data()), + e_grad, + index_size, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } else if (compute_type == "MUL") { + ManipulateSumGradCUDAKernelForMulE< + T, + IndexT><<>>( + x_data, + out_grad, + s_index, + d_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + e_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } } else if (pool_type == "MEAN") { const int* s_count = dst_count->data(); if (compute_type == "ADD") { + ManipulateMeanGradCUDAKernelForAddE< + T, + IndexT><<>>( + out_grad, + d_index, + s_count, + thrust::raw_pointer_cast(r_bcastoff.data()), + e_grad, + index_size, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } else if (compute_type == "MUL") { + ManipulateMeanGradCUDAKernelForMulE< + T, + IndexT><<>>( + x_data, + out_grad, + s_index, + d_index, + s_count, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + e_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } } } diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu 
b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu index 83e7931a0bddc..d9bd36450c05f 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu @@ -71,7 +71,7 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, if (index_size == 0) return; - const auto& bcast_info = CalcBCastInfo(x.dims(), e.dims()); + const auto& bcast_info = CalcBCastInfo(x.dims(), e.dims(), compute_type); const T* x_data = x.data(); const T* e_data = e.data(); const IndexT* s_index = src_index.data(); diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h index 322fa5bb333b8..5f896ca015bb2 100644 --- a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h @@ -39,7 +39,7 @@ bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) { BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, const phi::DDim& r_dims) { BroadCastInfo binfo; - binfo.use_bcast = UseBroadCast(l_dims, r_dims); + binfo.use_bcast = UseBroadCast(l_dims, r_dims, op); binfo.l_len = 1; binfo.r_len = 1; for (int i = 1; i < l_dims.size(); i++) { From 7f6fb7297a562558063a53c391a31d1358ad4a73 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 1 Jun 2022 11:52:14 +0000 Subject: [PATCH 14/51] fix some compile bug --- paddle/phi/infermeta/multiary.cc | 20 ++--- .../phi/kernels/gpu/graph_send_e_recv_funcs.h | 6 +- .../gpu/graph_send_e_recv_grad_kernel.cu | 21 ++--- .../kernels/gpu/graph_send_e_recv_kernel.cu | 78 ++++++++++--------- .../impl/graph_send_e_recv_kernel_impl.cc | 77 ++++++++++++++++++ .../impl/graph_send_e_recv_kernel_impl.h | 58 +------------- 6 files changed, 147 insertions(+), 113 deletions(-) create mode 100644 paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.cc diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index d506a35c967e3..4f08ced0c607b 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2381,24 +2381,24 @@ void GraphSendERecvInferMeta(const MetaTensor& x, // Infer out's shape according to x and e(need broadcasting condition) out->set_dtype(x.dtype()); - std::vector x_dims1 = phi::vectorize(x_dims); - std::vector e_dims1 = phi::vectorize(e_dims); + auto x_dims1 = phi::vectorize(x_dims); + auto e_dims1 = phi::vectorize(e_dims); std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); std::vector e_dims2(e_dims1.begin() + 1, e_dims1.end()); int max_dim = std::max(x_dims2.size(), e_dims2.size()); - int axis = std::abs(x_dims2.size() - e_dims2.size()); + int axis = std::abs((int)(x_dims2.size() - e_dims2.size())); std::vector x_dims_array(max_dim); std::vector e_dims_array(max_dim); std::vector out_dims_array(max_dim); // Only need to broadcast dimensions other than the 0th dimension. 
- GetBroadcastDimsArrays(phi::make_ddim(x_dims2), - phi::make_ddim(e_dims2), - x_dims_array.data(), - e_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); + phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), + phi::make_ddim(e_dims2), + x_dims_array.data(), + e_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); if (out_size <= 0) { out_dims_array.insert(out_dims_array.begin(), x_dims[0]); } else { diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h index 2a95f7cc3a91a..31514d4505a3e 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h @@ -14,6 +14,7 @@ #pragma once #include "paddle/phi/kernels/graph_send_e_recv_kernel.h" +#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" #include #include @@ -46,7 +47,7 @@ void CopyBCastOff(const BroadCastInfo& bcast_info, inline int FindNumThreads(int dim, int max_num_threads = CUDA_MAX_NUM_THREADS) { PADDLE_ENFORCE_GE(dim, 0, - platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Required dim >= 0, but received dim = %d", dim)); if (dim == 0) return 1; int res = max_num_threads; @@ -72,7 +73,7 @@ struct GraphSendERecvMaxCUDAFunctor { template struct GraphSendERecvMinCUDAFunctor { - DEVICE inline void operator()(T* output, val) { + DEVICE inline void operator()(T* output, T val) { paddle::platform::CudaAtomicMin(output, val); } }; @@ -254,6 +255,7 @@ __global__ void ManipulateMeanGradCUDAKernelForAddE(const T* out_grad_data, } // e_grad: backward mean for mul. +template __global__ void ManipulateMeanGradCUDAKernelForMulE(const T* x_data, const T* out_grad_data, const IndexT* src_indices, diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index b54c3c6000443..f5d901d2d030b 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -15,11 +15,12 @@ #include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" #include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h" -#include "paddle/phi/kernels/impl/graph_send_e_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace phi { @@ -41,7 +42,7 @@ void CalculateXEGradForMinMax(const Context& ctx, T* e_grad, const DenseTensor* out = nullptr) { const T* out_data = out->data(); - const auto& bcast_info = CalcBCastInfo(x_dims, e_dims); + const auto& bcast_info = phi::CalcBCastInfo(x_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); @@ -121,7 +122,7 @@ void CalculateXGrad(const Context& ctx, int64_t n = slice_size * index_size; int max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dim; + int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; if (pool_type == "SUM") { if (compute_type == "ADD") { GraphSendRecvSumCUDAFunctor functor; @@ -132,7 +133,7 @@ void CalculateXGrad(const Context& ctx, IndexT>><<>>( out_grad, d_index, s_index, x_grad, index_size, slice_size, functor); } else if (compute_type == "MUL") { - const auto& bcast_info = CalcBCastInfo(out_grad_dims, e_dims); + const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); @@ -144,12 +145,13 @@ void CalculateXGrad(const Context& ctx, const int nby = (index_size + nty - 1) / nty; const dim3 grid_(nbx, nby); const dim3 block_(ntx, nty); - MultiplyFunctor mul_functor; + funcs::MultiplyFunctor mul_functor; + GraphSendERecvSumCUDAFunctor sum_functor; GraphSendERecvCUDAKernel< T, IndexT, GraphSendERecvSumCUDAFunctor, - MultiplyFunctor><<>>( + funcs::MultiplyFunctor><<>>( out_grad, e_data, d_index, @@ -171,7 +173,7 @@ void CalculateXGrad(const Context& ctx, ManipulateMeanGradCUDAKernel<<>>( out_grad, d_index, s_index, x_grad, index_size, slice_size, s_count); } else if (compute_type == "MUL") { - const auto& bcast_info = CalcBCastInfo(out_grad_dims, e_dims); + const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); @@ -219,7 +221,7 @@ void CalculateEGrad(const Context& ctx, T* e_grad, const DenseTensor* dst_count = nullptr, const DenseTensor* out = nullptr) { - const auto& bcast_info = CalcBCastInfo(x_dims, e_dims); + const auto& bcast_info = phi::CalcBCastInfo(x_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); @@ -230,7 +232,7 @@ void CalculateEGrad(const Context& ctx, const int nbx = (out_len + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid(nbx, nby); - const dim3 block_(ntx, nty); + const dim3 block(ntx, nty); if (pool_type == "SUM") { if (compute_type == "ADD") { ManipulateSumGradCUDAKernelForAddE< @@ -375,7 +377,6 @@ void GraphSendERecvGradOpCUDAKernelLaunchHelper( compute_type, pool_type, index_size, - slice_size, e_grad_data, dst_count, out); diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu index d9bd36450c05f..375e7209bf3a9 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu @@ -71,7 +71,7 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, if (index_size == 0) return; - const auto& bcast_info = CalcBCastInfo(x.dims(), e.dims(), compute_type); + const auto& bcast_info = phi::CalcBCastInfo(x.dims(), e.dims()); const T* x_data = x.data(); const T* e_data = e.data(); const IndexT* s_index = src_index.data(); @@ -98,17 +98,18 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, if (pool_type == "SUM" || pool_type == "MEAN") { GraphSendERecvSumCUDAFunctor sum_functor; if (compute_type == "ADD") { - AddFunctor add_funtor; - GraphSendERecvCUDAKernel, - AddFunctor><<>>( + funcs::AddFunctor add_funtor; + GraphSendERecvCUDAKernel< + T, + IndexT, + GraphSendERecvSumCUDAFunctor, + funcs::AddFunctor><<>>( x_data, e_data, s_index, d_index, - thrust::raw_pointer_cast(x_bcastoff.data()); - thrust::raw_pointer_cast(e_bcastoff.data()); + thrust::raw_pointer_cast(x_bcastoff.data()), + 
thrust::raw_pointer_cast(e_bcastoff.data()), out_data, index_size, bcast_info.l_len, @@ -118,18 +119,18 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, add_funtor, sum_functor); } else if (compute_type == "MUL") { - MultiplyFunctor mul_functor; + funcs::MultiplyFunctor mul_functor; GraphSendERecvCUDAKernel< T, IndexT, GraphSendERecvSumCUDAFunctor, - MultiplyFunctor><<>>( + funcs::MultiplyFunctor><<>>( x_data, e_data, s_index, d_index, - thrust::raw_pointer_cast(x_bcastoff.data()); - thrust::raw_pointer_cast(e_bcastoff.data()); + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), out_data, index_size, bcast_info.l_len, @@ -140,8 +141,8 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, sum_functor); } if (pool_type == "MEAN") { - ctx.template Alloc(dst_count); - int32_t* dst_count_data = dst_count->data(); + ctx.template Alloc(dst_count); + int32_t* dst_count_data = dst_count->data(); if (out_size > 0) { input_size = out_size; } @@ -156,6 +157,7 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, dst_count_data, d_index, index_size); int64_t grid_mean = (input_size * out_len + block_ - 1) / block_; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_mean_ = grid_mean < max_grid_dimx ? grid_mean : max_grid_dimx; ManipulateMeanCUDAKernel<<>>( @@ -164,17 +166,18 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, } else if (pool_type == "MAX") { GraphSendERecvMaxCUDAFunctor max_functor; if (compute_type == "ADD") { - AddFunctor add_funtor; - GraphSendERecvCUDAKernel, - AddFunctor><<>>( + funcs::AddFunctor add_funtor; + GraphSendERecvCUDAKernel< + T, + IndexT, + GraphSendERecvMaxCUDAFunctor, + funcs::AddFunctor><<>>( x_data, e_data, s_index, d_index, - thrust::raw_pointer_cast(x_bcastoff.data()); - thrust::raw_pointer_cast(e_bcastoff.data()); + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), out_data, index_size, bcast_info.l_len, @@ -184,18 +187,18 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, add_funtor, max_functor); } else if (compute_type == "MUL") { - MultiplyFunctor mul_functor; + funcs::MultiplyFunctor mul_functor; GraphSendERecvCUDAKernel< T, IndexT, GraphSendERecvMaxCUDAFunctor, - MultiplyFunctor><<>>( + funcs::MultiplyFunctor><<>>( x_data, e_data, s_index, d_index, - thrust::raw_pointer_cast(x_bcastoff.data()); - thrust::raw_pointer_cast(e_bcastoff.data()); + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), out_data, index_size, bcast_info.l_len, @@ -209,23 +212,25 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, input_size = out_size; } int64_t grid_max = (input_size * out_len + block_ - 1) / block_; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_max_ = grid_max < max_grid_dimx ? 
grid_max : max_grid_dimx; InputResetMaxCUDAKernel<<>>( out_data, input_size, out_len); } else if (pool_type == "MIN") { GraphSendERecvMinCUDAFunctor min_functor; if (compute_type == "ADD") { - AddFunctor add_funtor; - GraphSendERecvCUDAKernel, - AddFunctor><<>>( + funcs::AddFunctor add_funtor; + GraphSendERecvCUDAKernel< + T, + IndexT, + GraphSendERecvMinCUDAFunctor, + funcs::AddFunctor><<>>( x_data, e_data, s_index, d_index, - thrust::raw_pointer_cast(x_bcastoff.data()); - thrust::raw_pointer_cast(e_bcastoff.data()); + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), out_data, index_size, bcast_info.l_len, @@ -235,18 +240,18 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, add_funtor, min_functor); } else if (compute_type == "MUL") { - MultiplyFunctor mul_functor; + funcs::MultiplyFunctor mul_functor; GraphSendERecvCUDAKernel< T, IndexT, GraphSendERecvMinCUDAFunctor, - MultiplyFunctor><<>>( + funcs::MultiplyFunctor><<>>( x_data, e_data, s_index, d_index, - thrust::raw_pointer_cast(x_bcastoff.data()); - thrust::raw_pointer_cast(e_bcastoff.data()); + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), out_data, index_size, bcast_info.l_len, @@ -260,6 +265,7 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, input_size = out_size; } int64_t grid_min = (input_size * out_len + block_ - 1) / block_; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_min_ = grid_min < max_grid_dimx ? grid_min : max_grid_dimx; InputResetMinCUDAKernel<<>>( out_data, input_size, out_len); diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.cc b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.cc new file mode 100644 index 0000000000000..e6415a748a3aa --- /dev/null +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" + +namespace phi { + +bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) { + if (l_dims.size() != r_dims.size()) { + return true; + } + for (int i = 1; i < l_dims.size(); i++) { + if (l_dims[i] != r_dims[i]) { + return true; + } + } + return false; +} + +BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, const phi::DDim& r_dims) { + BroadCastInfo binfo; + binfo.use_bcast = UseBroadCast(l_dims, r_dims); + binfo.l_len = 1; + binfo.r_len = 1; + for (int i = 1; i < l_dims.size(); i++) { + binfo.l_len *= l_dims[i]; + } + for (int i = 1; i < r_dims.size(); i++) { + binfo.r_len *= r_dims[i]; + } + // TODO(daisiming): Whether to add dot. 
+ binfo.reduce_size = 1; + if (binfo.use_bcast) { + const int max_dim = std::max(l_dims.size(), r_dims.size()) - 1; + int stride_l = 1, stride_r = 1; + binfo.l_offset.emplace_back(0); + binfo.r_offset.emplace_back(0); + int out_len = 1; + for (int i = 0; i < max_dim; i++) { + // Iterate the axis from back to front. + const int dl = + (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i]; + const int dr = + (r_dims.size() - 1 - i < 1) ? 1 : r_dims[r_dims.size() - 1 - i]; + for (int j = 0; j < std::max(dl, dr); j++) { + for (int k = 0; k < out_len; k++) { + binfo.l_offset.emplace_back(binfo.l_offset[k] + + j * (j < dl) * stride_l); + binfo.r_offset.emplace_back(binfo.r_offset[k] + + j * (j < dr) * stride_r); + } + } + out_len *= std::max(dl, dr); + stride_l *= dl; + stride_r *= dr; + } + binfo.out_len = out_len; + } else { + binfo.out_len = binfo.l_len; + } + return binfo; +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h index 5f896ca015bb2..57caf246dd046 100644 --- a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h @@ -13,7 +13,9 @@ // limitations under the License. #pragma once + #include +#include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { @@ -25,60 +27,6 @@ struct BroadCastInfo { int64_t l_len, r_len, out_len, reduce_size; }; -bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) { - if (l_dims.size() != r_dims.size()) { - return true; - } - for (int i = 1; i < l_dims.size(); i++) { - if (l_dims[i] != r_dims[i]) { - return true; - } - } - return false; -} - -BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, const phi::DDim& r_dims) { - BroadCastInfo binfo; - binfo.use_bcast = UseBroadCast(l_dims, r_dims, op); - binfo.l_len = 1; - binfo.r_len = 1; - for (int i = 1; i < l_dims.size(); i++) { - binfo.l_len *= l_dims[i]; - } - for (int i = 1; i < r_dims.size(); i++) { - binfo.r_len *= r_dims[i]; - } - // TODO(daisiming): Whether to add dot. - binfo.reduce_size = 1; - if (binfo.use_bcast) { - const int max_dim = std::max(l_dims.size(), r_dims.size()) - 1; - int stride_l = 1, stride_r = 1; - binfo.l_offset.emplace_back(0); - binfo.r_offset.emplace_back(0); - int out_len = 1; - for (int i = 0; i < max_dim; i++) { - // Iterate the axis from back to front. - const int dl = - (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i]; - const int dr = - (r_dims.size() - 1 - i < 1) ? 
1 : r_dims[r_dims.size() - 1 - i]; - for (int j = 0; j < std::max(dl, dr); j++) { - for (int k = 0; k < out_len; k++) { - binfo.l_offset.emplace_back(binfo.l_offset[k] + - j * (j < dl) * stride_l); - binfo.r_offset.emplace_back(binfo.r_offset[k] + - j * (j < dr) * stride_r); - } - } - out_len *= std::max(dl, dr); - stride_l *= dl; - stride_r *= dr; - } - binfo.out_len = out_len; - } else { - binfo.out_len = binfo.l_len; - } - return binfo; -} +BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, const phi::DDim& r_dims); } // namespace phi From 375be763f87ec35bbc9342f9abed37f9943de128 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 1 Jun 2022 12:40:06 +0000 Subject: [PATCH 15/51] fix compile bugs --- .../fluid/operators/graph_send_e_recv_op.cc | 4 +- .../phi/kernels/gpu/graph_send_e_recv_funcs.h | 7 +- .../impl/graph_send_e_recv_kernel_impl.cc | 77 ------------------- .../impl/graph_send_e_recv_kernel_impl.h | 58 +++++++++++++- 4 files changed, 62 insertions(+), 84 deletions(-) delete mode 100644 paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.cc diff --git a/paddle/fluid/operators/graph_send_e_recv_op.cc b/paddle/fluid/operators/graph_send_e_recv_op.cc index 262c1b5994b50..8b1e0f87c9f79 100644 --- a/paddle/fluid/operators/graph_send_e_recv_op.cc +++ b/paddle/fluid/operators/graph_send_e_recv_op.cc @@ -33,7 +33,7 @@ class GraphSendERecvOP : public framework::OperatorWithKernel { } }; -class GraphSendERecvGradOP : public framework::OperatorWithKernel { +class GraphSendERecvGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -68,7 +68,7 @@ class GraphSendERecvOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddAttr("compute_type", "(string, default 'ADD')" - "Define differenct computation types between X and E".) + "Define differenct computation types between X and E.") .SetDefault("ADD") .InEnum({"ADD", "MUL"}); AddAttr("pool_type", diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h index 31514d4505a3e..42b9c0fe2a1f9 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/phi/kernels/graph_send_e_recv_kernel.h" #include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" #include @@ -29,9 +28,9 @@ namespace phi { #define CUDA_MAX_NUM_THREADS 1024 -void CopyBCastOff(const BroadCastInfo& bcast_info, - thrust::device_vector& l_bcastoff, - thrust::device_vector& r_bcastoff) { +inline void CopyBCastOff(const BroadCastInfo& bcast_info, + thrust::device_vector& l_bcastoff, + thrust::device_vector& r_bcastoff) { l_bcastoff.resize(bcast_info.out_len); r_bcastoff.resize(bcast_info.out_len); cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.cc b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.cc deleted file mode 100644 index e6415a748a3aa..0000000000000 --- a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" - -namespace phi { - -bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) { - if (l_dims.size() != r_dims.size()) { - return true; - } - for (int i = 1; i < l_dims.size(); i++) { - if (l_dims[i] != r_dims[i]) { - return true; - } - } - return false; -} - -BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, const phi::DDim& r_dims) { - BroadCastInfo binfo; - binfo.use_bcast = UseBroadCast(l_dims, r_dims); - binfo.l_len = 1; - binfo.r_len = 1; - for (int i = 1; i < l_dims.size(); i++) { - binfo.l_len *= l_dims[i]; - } - for (int i = 1; i < r_dims.size(); i++) { - binfo.r_len *= r_dims[i]; - } - // TODO(daisiming): Whether to add dot. - binfo.reduce_size = 1; - if (binfo.use_bcast) { - const int max_dim = std::max(l_dims.size(), r_dims.size()) - 1; - int stride_l = 1, stride_r = 1; - binfo.l_offset.emplace_back(0); - binfo.r_offset.emplace_back(0); - int out_len = 1; - for (int i = 0; i < max_dim; i++) { - // Iterate the axis from back to front. - const int dl = - (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i]; - const int dr = - (r_dims.size() - 1 - i < 1) ? 1 : r_dims[r_dims.size() - 1 - i]; - for (int j = 0; j < std::max(dl, dr); j++) { - for (int k = 0; k < out_len; k++) { - binfo.l_offset.emplace_back(binfo.l_offset[k] + - j * (j < dl) * stride_l); - binfo.r_offset.emplace_back(binfo.r_offset[k] + - j * (j < dr) * stride_r); - } - } - out_len *= std::max(dl, dr); - stride_l *= dl; - stride_r *= dr; - } - binfo.out_len = out_len; - } else { - binfo.out_len = binfo.l_len; - } - return binfo; -} - -} // namespace phi diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h index 57caf246dd046..5c8d5340693ae 100644 --- a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h @@ -15,6 +15,7 @@ #pragma once #include + #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { @@ -27,6 +28,61 @@ struct BroadCastInfo { int64_t l_len, r_len, out_len, reduce_size; }; -BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, const phi::DDim& r_dims); +inline bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) { + if (l_dims.size() != r_dims.size()) { + return true; + } + for (int i = 1; i < l_dims.size(); i++) { + if (l_dims[i] != r_dims[i]) { + return true; + } + } + return false; +} + +inline BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, + const phi::DDim& r_dims) { + BroadCastInfo binfo; + binfo.use_bcast = UseBroadCast(l_dims, r_dims); + binfo.l_len = 1; + binfo.r_len = 1; + for (int i = 1; i < l_dims.size(); i++) { + binfo.l_len *= l_dims[i]; + } + for (int i = 1; i < r_dims.size(); i++) { + binfo.r_len *= r_dims[i]; + } + // TODO(daisiming): Whether to add dot. 
+  binfo.reduce_size = 1;
+  if (binfo.use_bcast) {
+    const int max_dim = std::max(l_dims.size(), r_dims.size()) - 1;
+    int stride_l = 1, stride_r = 1;
+    binfo.l_offset.emplace_back(0);
+    binfo.r_offset.emplace_back(0);
+    int out_len = 1;
+    for (int i = 0; i < max_dim; i++) {
+      // Iterate the axis from back to front.
+      const int dl =
+          (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i];
+      const int dr =
+          (r_dims.size() - 1 - i < 1) ? 1 : r_dims[r_dims.size() - 1 - i];
+      for (int j = 0; j < std::max(dl, dr); j++) {
+        for (int k = 0; k < out_len; k++) {
+          binfo.l_offset.emplace_back(binfo.l_offset[k] +
+                                      j * (j < dl) * stride_l);
+          binfo.r_offset.emplace_back(binfo.r_offset[k] +
+                                      j * (j < dr) * stride_r);
+        }
+      }
+      out_len *= std::max(dl, dr);
+      stride_l *= dl;
+      stride_r *= dr;
+    }
+    binfo.out_len = out_len;
+  } else {
+    binfo.out_len = binfo.l_len;
+  }
+  return binfo;
+}
 
 }  // namespace phi

From 0ea88781226af99dfa548103ee77fa3390568ef6 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Thu, 2 Jun 2022 09:18:57 +0000
Subject: [PATCH 16/51] fix compile problem

---
 paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu | 4 ++--
 paddle/phi/kernels/graph_send_e_recv_grad_kernel.h      | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
index f5d901d2d030b..59bd784d3f1c9 100644
--- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
@@ -406,8 +406,8 @@ void GraphSendERecvGradKernel(const Context& ctx,
                               const DenseTensor& e,
                               const DenseTensor& src_index,
                               const DenseTensor& dst_index,
-                              paddle::optional<const DenseTensor&> out,
-                              paddle::optional<const DenseTensor&> dst_count,
+                              const paddle::optional<DenseTensor>& out,
+                              const paddle::optional<DenseTensor>& dst_count,
                               const DenseTensor& out_grad,
                               const std::string& compute_type,
                               const std::string& pool_type,
diff --git a/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h
index 8b5fa72f14f57..cd9a9ee98c84b 100644
--- a/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h
+++ b/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h
@@ -26,8 +26,8 @@ void GraphSendERecvGradKernel(const Context& ctx,
                               const DenseTensor& e,
                               const DenseTensor& src_index,
                               const DenseTensor& dst_index,
-                              paddle::optional<const DenseTensor&> out,
-                              paddle::optional<const DenseTensor&> dst_count,
+                              const paddle::optional<DenseTensor>& out,
+                              const paddle::optional<DenseTensor>& dst_count,
                               const DenseTensor& out_grad,
                               const std::string& compute_type,
                               const std::string& pool_type,
                               DenseTensor* x_grad,
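Editorial aside, not part of the patch series: before the tests below, here is a self-contained illustration of how the kernels consume the broadcast offset tables that CalcBCastInfo prepares. The tables are written out by hand for per-row tail shapes {4} (x) versus {1} (e); names and values are for this example only.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      // One edge's data: the x row carries 4 features, the e row carries 1.
      std::vector<float> x_row = {1.f, 2.f, 3.f, 4.f};
      std::vector<float> e_row = {10.f};
      // Hand-written offset tables for tails {4} vs {1}: output element tx
      // reads x_row[l_offset[tx]] and e_row[r_offset[tx]].
      std::vector<int64_t> l_offset = {0, 1, 2, 3};
      std::vector<int64_t> r_offset = {0, 0, 0, 0};  // e's value is reused
      const bool use_bcast = true;
      for (int64_t tx = 0; tx < 4; ++tx) {
        // Mirrors the per-thread indexing of GraphSendERecvCUDAKernel,
        // here with the "MUL" compute type.
        float val = x_row[use_bcast ? l_offset[tx] : tx] *
                    e_row[use_bcast ? r_offset[tx] : tx];
        std::printf("out[%lld] = %g\n", static_cast<long long>(tx), val);
      }
      return 0;  // prints 10 20 30 40: e is broadcast across x's features
    }

This is why the kernels never materialize e at the broadcast shape: the precomputed tables turn broadcasting into one indirect read per output element.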
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid.framework import _test_eager_guard + +from op_test import OpTest + + +class TestGraphSendERecvSumOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "graph_send_e_recv" + self.set_config() + self.inputs = { + 'X': self.x, + 'E': self.e, + 'Src_index': self.src_index, + 'Dst_index': self.dst_index + } + self.attrs = {'compute_type': self.compute_type, 'pool_type': 'SUM'} + + out = compute_graph_send_e_recv_for_sum(self.inputs, self.attrs) + + self.outputs = {'Out': out} + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + def test_check_output(self): + self.check_output() + + +def TestSumCase1(TestGraphSendERecvSumOp): + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +def TestSumCase2(TestGraphSendERecvSumOp): + def set_config(self): + self.x = np.random.random((10, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +def TestSumCase3(TestGraphSendERecvSumOp): + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +def TestSumCase4(TestGraphSendERecvSumOp): + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +def TestSumCase5(TestGraphSendERecvSumOp): + def set_config(self): + self.x = np.random.random((10, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +def get_broadcast_shape(shp1, shp2): + pad_shp1, pad_shp2 = shp1, shp2 + if len(shp1) > len(shp2): + pad_shp2 = [1, ] * (len(shp1) - len(shp2)) + shp2 + elif len(shp1) < len(shp2): + pad_shp1 = [1, ] * (len(shp2) - len(shp1)) + shp1 + for d1, d2 in zip(pad_shp1, pad_shp2): + if d1 != d2 and d1 != 1 and d2 != 1: + raise ValueError + rst = [max(d1, d2) for d1, d2 in zip(pad_shp1, pad_shp2)] + return rst + + +def compute_graph_send_e_recv_for_sum(inputs, attributes): + x = inputs['X'] + e = inputs['E'] + src_index = 
inputs['Src_index'] + dst_index = inputs['Dst_index'] + compute_type = attributes['compute_type'] + + gather_x = x[src_index] + out_shp = [x.shape[0], ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) + results = np.zeros(out_shp, dtype=x.dtype) + + # Calculate forward output + if compute_type == 'ADD': + x_compute_e = gather_x + e + elif compute_type == 'MUL': + x_compute_e = gather_x * e + for index, s_id in enumerate(dst_index): + results[s_id, :] += x_compute_e[index, :] + return results From f961f9b31156e9b6796e2dcf707d68f91a291ed8 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 6 Jun 2022 08:37:11 +0000 Subject: [PATCH 18/51] fix broadcast error, add kernel sig, register e_grad, change unit test --- .../fluid/operators/graph_send_e_recv_op.cc | 1 + .../impl/graph_send_e_recv_kernel_impl.h | 2 +- .../phi/ops/compat/graph_send_e_recv_sig.cc | 42 ++++++++ .../unittests/test_graph_send_e_recv_op.py | 100 +++++++++--------- 4 files changed, 95 insertions(+), 50 deletions(-) create mode 100644 paddle/phi/ops/compat/graph_send_e_recv_sig.cc diff --git a/paddle/fluid/operators/graph_send_e_recv_op.cc b/paddle/fluid/operators/graph_send_e_recv_op.cc index 8b1e0f87c9f79..994153f50af81 100644 --- a/paddle/fluid/operators/graph_send_e_recv_op.cc +++ b/paddle/fluid/operators/graph_send_e_recv_op.cc @@ -124,6 +124,7 @@ class GraphSendERecvGradOpMaker : public framework::SingleGradOpMaker { op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("E"), this->InputGrad("E")); op->SetAttrMap(this->Attrs()); } }; diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h index 5c8d5340693ae..23bcae5e077fa 100644 --- a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h @@ -66,7 +66,7 @@ inline BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i]; const int dr = (r_dims.size() - 1 - i < 1) ? 1 : r_dims[r_dims.size() - 1 - i]; - for (int j = 0; j < std::max(dl, dr); j++) { + for (int j = 1; j < std::max(dl, dr); j++) { for (int k = 0; k < out_len; k++) { binfo.l_offset.emplace_back(binfo.l_offset[k] + j * (j < dl) * stride_l); diff --git a/paddle/phi/ops/compat/graph_send_e_recv_sig.cc b/paddle/phi/ops/compat/graph_send_e_recv_sig.cc new file mode 100644 index 0000000000000..a89708cf35736 --- /dev/null +++ b/paddle/phi/ops/compat/graph_send_e_recv_sig.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GraphSendERecvOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("graph_send_e_recv", + {"X", "E", "Src_index", "Dst_index"}, + {"compute_type", "pool_type", "out_size"}, + {"Out", "Dst_count"}); +} + +KernelSignature GraphSendERecvGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "graph_send_e_recv_grad", + {"X", "E", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, + {"compute_type", "pool_type"}, + {"X@GRAD", "E@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(graph_send_e_recv, + phi::GraphSendERecvOpArgumentMapping); + +PD_REGISTER_ARG_MAPPING_FN(graph_send_e_recv_grad, + phi::GraphSendERecvGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py index 0a1a995fdf39b..f2abf65e05361 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py @@ -13,7 +13,6 @@ # limitations under the License. import unittest - import numpy as np import paddle import paddle.fluid as fluid @@ -22,6 +21,40 @@ from op_test import OpTest +def get_broadcast_shape(shp1, shp2): + pad_shp1, pad_shp2 = shp1, shp2 + if len(shp1) > len(shp2): + pad_shp2 = [1, ] * (len(shp1) - len(shp2)) + shp2 + elif len(shp1) < len(shp2): + pad_shp1 = [1, ] * (len(shp2) - len(shp1)) + shp1 + for d1, d2 in zip(pad_shp1, pad_shp2): + if d1 != d2 and d1 != 1 and d2 != 1: + raise ValueError + rst = [max(d1, d2) for d1, d2 in zip(pad_shp1, pad_shp2)] + return rst + + +def compute_graph_send_e_recv_for_sum(inputs, attributes): + x = inputs['X'] + e = inputs['E'] + src_index = inputs['Src_index'] + dst_index = inputs['Dst_index'] + compute_type = attributes['compute_type'] + + gather_x = x[src_index] + out_shp = [x.shape[0], ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) + results = np.zeros(out_shp, dtype=x.dtype) + + # Calculate forward output + if compute_type == 'ADD': + x_compute_e = gather_x + e + elif compute_type == 'MUL': + x_compute_e = gather_x * e + for index, s_id in enumerate(dst_index): + results[s_id, :] += x_compute_e[index, :] + return results + + class TestGraphSendERecvSumOp(OpTest): def setUp(self): paddle.enable_static() @@ -50,28 +83,31 @@ def set_config(self): def test_check_output(self): self.check_output() + def test_check_grad(self): + self.check_grad(['X', 'E'], 'Out') + -def TestSumCase1(TestGraphSendERecvSumOp): +class TestSumCase1(TestGraphSendERecvSumOp): def set_config(self): - self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 1)).astype("float64") - index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.x = np.random.random((100, 20)).astype("float64") + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 100, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'ADD' -def TestSumCase2(TestGraphSendERecvSumOp): +class TestSumCase2(TestGraphSendERecvSumOp): def set_config(self): - self.x = np.random.random((10, 1)).astype("float64") + self.x = np.random.random((100, 1)).astype("float64") self.e = np.random.random((15, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] 
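+        # Column 0 of index holds the source node ids, column 1 the
+        # destination node ids of the sampled edges.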
self.dst_index = index[:, 1] self.compute_type = 'ADD' -def TestSumCase3(TestGraphSendERecvSumOp): +class TestSumCase3(TestGraphSendERecvSumOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") self.e = np.random.random((15, 20)).astype("float64") @@ -81,55 +117,21 @@ def set_config(self): self.compute_type = 'MUL' -def TestSumCase4(TestGraphSendERecvSumOp): +class TestSumCase4(TestGraphSendERecvSumOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 1)).astype("float64") - index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'MUL' -def TestSumCase5(TestGraphSendERecvSumOp): +class TestSumCase5(TestGraphSendERecvSumOp): def set_config(self): - self.x = np.random.random((10, 1)).astype("float64") + self.x = np.random.random((100, 1)).astype("float64") self.e = np.random.random((15, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'MUL' - - -def get_broadcast_shape(shp1, shp2): - pad_shp1, pad_shp2 = shp1, shp2 - if len(shp1) > len(shp2): - pad_shp2 = [1, ] * (len(shp1) - len(shp2)) + shp2 - elif len(shp1) < len(shp2): - pad_shp1 = [1, ] * (len(shp2) - len(shp1)) + shp1 - for d1, d2 in zip(pad_shp1, pad_shp2): - if d1 != d2 and d1 != 1 and d2 != 1: - raise ValueError - rst = [max(d1, d2) for d1, d2 in zip(pad_shp1, pad_shp2)] - return rst - - -def compute_graph_send_e_recv_for_sum(inputs, attributes): - x = inputs['X'] - e = inputs['E'] - src_index = inputs['Src_index'] - dst_index = inputs['Dst_index'] - compute_type = attributes['compute_type'] - - gather_x = x[src_index] - out_shp = [x.shape[0], ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) - results = np.zeros(out_shp, dtype=x.dtype) - - # Calculate forward output - if compute_type == 'ADD': - x_compute_e = gather_x + e - elif compute_type == 'MUL': - x_compute_e = gather_x * e - for index, s_id in enumerate(dst_index): - results[s_id, :] += x_compute_e[index, :] - return results From 1cbbb4d9a51386bbb9417ca3312e1870f3b00555 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 7 Jun 2022 12:59:55 +0000 Subject: [PATCH 19/51] fix grad --- paddle/phi/infermeta/multiary.cc | 2 +- .../impl/graph_send_e_recv_kernel_impl.h | 29 ++++++++++ .../unittests/test_graph_send_e_recv_op.py | 58 +++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7737d72e11c1c..5202806676eb5 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2386,7 +2386,7 @@ void GraphSendERecvInferMeta(const MetaTensor& x, std::vector e_dims2(e_dims1.begin() + 1, e_dims1.end()); int max_dim = std::max(x_dims2.size(), e_dims2.size()); - int axis = std::abs((int)(x_dims2.size() - e_dims2.size())); + int axis = std::abs(static_cast(x_dims2.size() - e_dims2.size())); std::vector x_dims_array(max_dim); std::vector e_dims_array(max_dim); std::vector out_dims_array(max_dim); diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h index 23bcae5e077fa..b54f615947d3f 100644 --- 
a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h @@ -16,6 +16,7 @@ #include +#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { @@ -85,4 +86,32 @@ inline BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, return binfo; } +inline std::vector InferBroadcastShape(const phi::DDim& x_dims, + const phi::DDim& e_dims, + const std::string& type = "x") { + auto x_dims1 = phi::vectorize(x_dims); + auto e_dims1 = phi::vectorize(e_dims); + std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); + std::vector e_dims2(e_dims1.begin() + 1, e_dims1.end()); + int max_dim = std::max(x_dims2.size(), e_dims2.size()); + int axis = std::abs(static_cast(x_dims2.size() - e_dims2.size())); + std::vector x_dims_array(max_dim); + std::vector e_dims_array(max_dim); + std::vector out_dims_array(max_dim); + // Only need to broadcast dimensions other than the 0th dimension. + phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), + phi::make_ddim(e_dims2), + x_dims_array.data(), + e_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + if (type == "x") { + out_dims_array.insert(out_dims_array.begin(), x_dims[0]); + } else { + out_dims_array.insert(out_dims_array.begin(), e_dims[0]); + } + return out_dims_array; +} + } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py index f2abf65e05361..04008667f9efa 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py @@ -55,6 +55,31 @@ def compute_graph_send_e_recv_for_sum(inputs, attributes): return results +def compute_graph_send_e_recv_for_mean(inputs, attributes): + x = inputs['X'] + e = inputs['E'] + src_index = inputs['Src_index'] + dst_index = inputs['Dst_index'] + compute_type = attributes['compute_type'] + + gather_x = x[src_index] + out_shp = [x.shape[0], ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) + results = np.zeros(out_shp, dtype=x.dtype) + + # Calculate forward output + if compute_type == 'ADD': + x_compute_e = gather_x + e + elif compute_type == 'MUL': + x_compute_e = gather_x * e + count = np.zeros(out_shp[0], dtype=np.int32) + for index, s_id in enumerate(dst_index): + results[s_id, :] += x_compute_e[index, :] + count[s_id] += 1 + results = results / count.reshape([-1, 1]) + results[np.isnan(results)] = 0 + return results, count + + class TestGraphSendERecvSumOp(OpTest): def setUp(self): paddle.enable_static() @@ -135,3 +160,36 @@ def set_config(self): self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'MUL' + + +class TestGraphSendERecvMeanOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "graph_send_e_recv" + self.set_config() + self.inputs = { + 'X': self.x, + 'E': self.e, + 'Src_index': self.src_index, + 'Dst_index': self.dst_index + } + self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MEAN'} + + out, dst_count = compute_graph_send_e_recv_for_mean(self.inputs, + self.attrs) + + self.outputs = {'Out': out, 'Dst_count': dst_count} + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + def 
test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X', 'E'], 'Out') From e9d57fef5818790be0dadfc0bb0aaa647af1ebfd Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 9 Jun 2022 06:57:16 +0000 Subject: [PATCH 20/51] add temp grad fix --- .../gpu/graph_send_e_recv_grad_kernel.cu | 270 +++++++++--------- 1 file changed, 134 insertions(+), 136 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index 59bd784d3f1c9..e23fa68f2f5ec 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -12,15 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h" -#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" -#include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h" -#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" - #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h" +#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" +#include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h" +#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" namespace phi { @@ -57,43 +56,41 @@ void CalculateXEGradForMinMax(const Context& ctx, const dim3 block(ntx, nty); if (compute_type == "ADD") { - ManipulateMinMaxGradCUDAKernelForAdd< - T, - IndexT><<>>( - x_data, - e_data, - out_data, - out_grad, - d_index, - s_index, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - x_grad, - e_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); + ManipulateMinMaxGradCUDAKernelForAdd + <<>>( + x_data, + e_data, + out_data, + out_grad, + d_index, + s_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad, + e_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } else if (compute_type == "MUL") { - ManipulateMinMaxGradCUDAKernelForMul< - T, - IndexT><<>>( - x_data, - e_data, - out_data, - out_grad, - d_index, - s_index, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - x_grad, - e_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); + ManipulateMinMaxGradCUDAKernelForMul + <<>>( + x_data, + e_data, + out_data, + out_grad, + d_index, + s_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad, + e_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } } @@ -126,12 +123,14 @@ void CalculateXGrad(const Context& ctx, if (pool_type == "SUM") { if (compute_type == "ADD") { GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<>>( - out_grad, d_index, s_index, x_grad, index_size, slice_size, functor); + GraphSendRecvCUDAKernel> + <<>>(out_grad, + d_index, + s_index, + x_grad, + index_size, + slice_size, + functor); } else if (compute_type == "MUL") { const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; @@ -147,25 +146,25 
@@ void CalculateXGrad(const Context& ctx, const dim3 block_(ntx, nty); funcs::MultiplyFunctor mul_functor; GraphSendERecvSumCUDAFunctor sum_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvSumCUDAFunctor, - funcs::MultiplyFunctor><<>>( - out_grad, - e_data, - d_index, - s_index, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - x_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast, - mul_functor, - sum_functor); + GraphSendERecvCUDAKernel, + funcs::MultiplyFunctor> + <<>>( + out_grad, + e_data, + d_index, + s_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + mul_functor, + sum_functor); } } else if (pool_type == "MEAN") { const int* s_count = dst_count->data(); @@ -173,7 +172,11 @@ void CalculateXGrad(const Context& ctx, ManipulateMeanGradCUDAKernel<<>>( out_grad, d_index, s_index, x_grad, index_size, slice_size, s_count); } else if (compute_type == "MUL") { + const auto& broad_shape = phi::InferBroadcastShape(out_grad_dims, e_dims); const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); + DenseTensor out_grad_temp; + out_grad_temp.Resize(phi::make_ddim(bcast_info)); + ctx.template Alloc(&out_grad_temp); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); @@ -185,22 +188,21 @@ void CalculateXGrad(const Context& ctx, const int nby = (index_size + nty - 1) / nty; const dim3 grid_(nbx, nby); const dim3 block_(ntx, nty); - ManipulateMeanGradCUDAKernelForMulX< - T, - IndexT><<>>( - out_grad, - e_data, - d_index, - s_index, - s_count, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - x_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); + ManipulateMeanGradCUDAKernelForMulX + <<>>( + out_grad, + e_data, + d_index, + s_index, + s_count, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } } } @@ -235,66 +237,62 @@ void CalculateEGrad(const Context& ctx, const dim3 block(ntx, nty); if (pool_type == "SUM") { if (compute_type == "ADD") { - ManipulateSumGradCUDAKernelForAddE< - T, - IndexT><<>>( - out_grad, - d_index, - thrust::raw_pointer_cast(r_bcastoff.data()), - e_grad, - index_size, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); + ManipulateSumGradCUDAKernelForAddE + <<>>( + out_grad, + d_index, + thrust::raw_pointer_cast(r_bcastoff.data()), + e_grad, + index_size, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } else if (compute_type == "MUL") { - ManipulateSumGradCUDAKernelForMulE< - T, - IndexT><<>>( - x_data, - out_grad, - s_index, - d_index, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - e_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); + ManipulateSumGradCUDAKernelForMulE + <<>>( + x_data, + out_grad, + s_index, + d_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + e_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } } else if (pool_type == "MEAN") { const int* s_count = dst_count->data(); if (compute_type == 
"ADD") { - ManipulateMeanGradCUDAKernelForAddE< - T, - IndexT><<>>( - out_grad, - d_index, - s_count, - thrust::raw_pointer_cast(r_bcastoff.data()), - e_grad, - index_size, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); + ManipulateMeanGradCUDAKernelForAddE + <<>>( + out_grad, + d_index, + s_count, + thrust::raw_pointer_cast(r_bcastoff.data()), + e_grad, + index_size, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } else if (compute_type == "MUL") { - ManipulateMeanGradCUDAKernelForMulE< - T, - IndexT><<>>( - x_data, - out_grad, - s_index, - d_index, - s_count, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - e_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); + ManipulateMeanGradCUDAKernelForMulE + <<>>( + x_data, + out_grad, + s_index, + d_index, + s_count, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + e_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); } } } From be98048f4c35f18605ac5dc30566cf41f5fc7b1f Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 27 Jun 2022 02:07:36 +0000 Subject: [PATCH 21/51] temp commit --- .../gpu/graph_send_e_recv_grad_kernel.cu | 5 +- .../impl/graph_send_e_recv_kernel_impl.h | 35 ++++++++++ .../paddle/fluid/tests/unittests/op_test.py | 2 + .../unittests/test_graph_send_e_recv_op.py | 66 +++++++++++++------ 4 files changed, 84 insertions(+), 24 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index e23fa68f2f5ec..0f197a9e53397 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -20,6 +20,7 @@ #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" #include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h" #include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -172,11 +173,7 @@ void CalculateXGrad(const Context& ctx, ManipulateMeanGradCUDAKernel<<>>( out_grad, d_index, s_index, x_grad, index_size, slice_size, s_count); } else if (compute_type == "MUL") { - const auto& broad_shape = phi::InferBroadcastShape(out_grad_dims, e_dims); const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); - DenseTensor out_grad_temp; - out_grad_temp.Resize(phi::make_ddim(bcast_info)); - ctx.template Alloc(&out_grad_temp); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h index b54f615947d3f..5e12ea25249c1 100644 --- a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h @@ -114,4 +114,39 @@ inline std::vector InferBroadcastShape(const phi::DDim& x_dims, return out_dims_array; } +inline std::vector GetGradReduceDim( + const std::vector& grad_dims, const std::vector& input_dims) { + // Whether to reduce the gradients. + // If there is broadcast in forward pass, gradients need to be reduced on + // broadcast dimension. 
+ std::vector grad_shape(grad_dims.begin() + 1, grad_dims.end()); + std::vector input_shape(input_dims.begin() + 1, input_dims.end()); + std::vector reduce_idx; + bool need_reduce = false; + if (grad_shape.size() != input_shape.size()) { + need_reduce = true; + } else { + for (int i = 0; i < grad_shape.size(); i++) { + if (grad_shape[i] != input_shape[i]) { + need_reduce = true; + break; + } + } + } + if (!need_reduce) { + return reduce_idx; + } + + int num_to_squeeze = grad_shape.size() - input_shape.size(); + for (int i = 0; i < num_to_squeeze; i++) { + input_shape.insert(input_shape.begin(), 1); + } + for (int i = 0; i < input_shape.size(); i++) { + if (grad_shape[i] - input_shape[i] != 0) { + reduce_idx.push_back(i + 1); + } + } + return reduce_idx; +} + } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index ded9f188472dd..72dce09eb2c50 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1774,6 +1774,7 @@ def check_output(self, self.__class__.use_xpu = True places = self._get_places() + places = places[1:] for place in places: res = self.check_output_with_place(place, atol, @@ -1860,6 +1861,7 @@ def check_grad(self, check_eager=False): self._check_grad_helper() places = self._get_places() + places = places[1:] for place in places: self.check_grad_with_place(place, inputs_to_check, diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py index 04008667f9efa..b0a9b6cc32c72 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py @@ -24,9 +24,13 @@ def get_broadcast_shape(shp1, shp2): pad_shp1, pad_shp2 = shp1, shp2 if len(shp1) > len(shp2): - pad_shp2 = [1, ] * (len(shp1) - len(shp2)) + shp2 + pad_shp2 = [ + 1, + ] * (len(shp1) - len(shp2)) + shp2 elif len(shp1) < len(shp2): - pad_shp1 = [1, ] * (len(shp2) - len(shp1)) + shp1 + pad_shp1 = [ + 1, + ] * (len(shp2) - len(shp1)) + shp1 for d1, d2 in zip(pad_shp1, pad_shp2): if d1 != d2 and d1 != 1 and d2 != 1: raise ValueError @@ -42,7 +46,9 @@ def compute_graph_send_e_recv_for_sum(inputs, attributes): compute_type = attributes['compute_type'] gather_x = x[src_index] - out_shp = [x.shape[0], ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) + out_shp = [ + x.shape[0], + ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) results = np.zeros(out_shp, dtype=x.dtype) # Calculate forward output @@ -63,7 +69,9 @@ def compute_graph_send_e_recv_for_mean(inputs, attributes): compute_type = attributes['compute_type'] gather_x = x[src_index] - out_shp = [x.shape[0], ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) + out_shp = [ + x.shape[0], + ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) results = np.zeros(out_shp, dtype=x.dtype) # Calculate forward output @@ -75,12 +83,13 @@ def compute_graph_send_e_recv_for_mean(inputs, attributes): for index, s_id in enumerate(dst_index): results[s_id, :] += x_compute_e[index, :] count[s_id] += 1 - results = results / count.reshape([-1, 1]) - results[np.isnan(results)] = 0 + results = results / count.reshape([-1, 1]) + results[np.isnan(results)] = 0 return results, count class TestGraphSendERecvSumOp(OpTest): + def setUp(self): paddle.enable_static() self.op_type = "graph_send_e_recv" @@ -113,6 +122,7 @@ def test_check_grad(self): class TestSumCase1(TestGraphSendERecvSumOp): + def 
set_config(self): self.x = np.random.random((100, 20)).astype("float64") self.e = np.random.random((150, 1)).astype("float64") @@ -123,46 +133,51 @@ def set_config(self): class TestSumCase2(TestGraphSendERecvSumOp): + def set_config(self): - self.x = np.random.random((100, 1)).astype("float64") + self.x = np.random.random((10, 20)).astype("float64") self.e = np.random.random((15, 20)).astype("float64") - index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.compute_type = 'MUL' class TestSumCase3(TestGraphSendERecvSumOp): + def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'MUL' class TestSumCase4(TestGraphSendERecvSumOp): + def set_config(self): - self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") - index = np.random.randint(0, 10, (150, 2)).astype(np.int64) + self.x = np.random.random((10, 8, 5)).astype("float64") + self.e = np.random.random((15, 8, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.compute_type = 'ADD' class TestSumCase5(TestGraphSendERecvSumOp): + def set_config(self): - self.x = np.random.random((100, 1)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") - index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + self.x = np.random.random((10, 8, 5)).astype("float64") + self.e = np.random.random((15, 8, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'MUL' class TestGraphSendERecvMeanOp(OpTest): + def setUp(self): paddle.enable_static() self.op_type = "graph_send_e_recv" @@ -175,8 +190,8 @@ def setUp(self): } self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MEAN'} - out, dst_count = compute_graph_send_e_recv_for_mean(self.inputs, - self.attrs) + out, dst_count = compute_graph_send_e_recv_for_mean( + self.inputs, self.attrs) self.outputs = {'Out': out, 'Dst_count': dst_count} @@ -193,3 +208,14 @@ def test_check_output(self): def test_check_grad(self): self.check_grad(['X', 'E'], 'Out') + + +def TestMeanCast1(TestGraphSendERecvSumOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' From 18b5382740f8d8706c46a7dac7f956675e30664c Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 8 Jul 2022 09:12:19 +0000 Subject: [PATCH 22/51] add min max unittest --- .../unittests/test_graph_send_e_recv_op.py | 287 +++++++++++++++++- 1 file changed, 279 insertions(+), 8 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py index b0a9b6cc32c72..32a84d9d5d775 100644 --- 
a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py @@ -88,6 +88,57 @@ def compute_graph_send_e_recv_for_mean(inputs, attributes): return results, count +def compute_graph_send_e_recv_for_max_min(inputs, attributes): + x = inputs['X'] + e = inputs['E'] + src_index = inputs['Src_index'] + dst_index = inputs['Dst_index'] + compute_type = attributes['compute_type'] + pool_type = attributes['pool_type'] + + gather_x = x[src_index] + out_shp = [ + x.shape[0], + ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) + results = np.zeros(out_shp, dtype=x.dtype) + + # Calculate forward output + if compute_type == 'ADD': + x_compute_e = gather_x + e + elif compute_type == 'MUL': + x_compute_e = gather_x * e + + first_set = set() + if pool_type == 'MAX': + for index, s_id in enumerate(dst_index): + if s_id not in first_set: + results[s_id, :] += x_compute_e[index, :] + first_set.add(s_id) + else: + results[s_id, :] = np.maximum(results[s_id, :], + x_compute_e[index, :]) + elif pool_type == 'MIN': + for index, s_id in enumerate(dst_index): + if s_id not in first_set: + results[s_id, :] += x_compute_e[index, :] + first_set.add(s_id) + else: + results[s_id, :] = np.minimum(results[s_id, :], + x_compute_e[index, :]) + else: + raise ValueError("Invalid pool_type, only MAX, MIN supported!") + + # Calculate backward gradient + x_gradient = np.zeros_like(x) + e_gradient = np.zeros_like(e) + for i in range(len(src_index)): + forward_src_idx = src_index[i] + forward_dst_idx = dst_index[i] + # ??? + + return results + + class TestGraphSendERecvSumOp(OpTest): def setUp(self): @@ -124,23 +175,23 @@ def test_check_grad(self): class TestSumCase1(TestGraphSendERecvSumOp): def set_config(self): - self.x = np.random.random((100, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") - index = np.random.randint(0, 100, (150, 2)).astype(np.int64) + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.compute_type = 'MUL' class TestSumCase2(TestGraphSendERecvSumOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.compute_type = 'ADD' class TestSumCase3(TestGraphSendERecvSumOp): @@ -210,7 +261,166 @@ def test_check_grad(self): self.check_grad(['X', 'E'], 'Out') -def TestMeanCast1(TestGraphSendERecvSumOp): +def TestMeanCase1(TestGraphSendERecvMeanOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +def TestMeanCase2(TestGraphSendERecvMeanOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 10, (150, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 
'SUM' + + +def TestMeanCase3(TestGraphSendERecvMeanOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 10, (150, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +def TestMeanCase4(TestGraphSendERecvMeanOp): + + def set_config(self): + self.x = np.random.random((10, 8, 5)).astype("float64") + self.e = np.random.random((15, 8, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'SUM' + + +def TestMeanCase5(TestGraphSendERecvMeanOp): + + def set_config(self): + self.x = np.random.random((10, 8, 5)).astype("float64") + self.e = np.random.random((15, 8, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +class TestGraphSendERecvMaxOp(OpTest): + + def setUp(self): + paddle.enable_static() + self.op_type = "graph_send_e_recv" + self.set_config() + self.inputs = { + 'X': self.x, + 'E': self.e, + 'Src_index': self.src_index, + 'Dst_index': self.dst_index + } + self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MAX'} + + out = compute_graph_send_e_recv_for_max_min(self.inputs, self.attrs) + + self.outputs = {'Out': out} + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + def test_check_output(self): + self.check_output() + + # def test_check_grad(self): + # self.check_grad(['X', 'E'], 'Out') + + +class TestMaxCase1(TestGraphSendERecvMaxOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +class TestMaxCase2(TestGraphSendERecvMaxOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 10, (150, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +class TestMaxCase3(TestGraphSendERecvMaxOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 10, (150, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +class TestMaxCase4(TestGraphSendERecvMaxOp): + + def set_config(self): + self.x = np.random.random((10, 8, 5)).astype("float64") + self.e = np.random.random((15, 8, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +class TestMaxCase5(TestGraphSendERecvMaxOp): + + def set_config(self): + self.x = np.random.random((10, 8, 5)).astype("float64") + self.e = np.random.random((15, 8, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = 
index[:, 1] + self.compute_type = 'MUL' + + +class TestGraphSendERecvMinOp(OpTest): + + def setUp(self): + paddle.enable_static() + self.op_type = "graph_send_e_recv" + self.set_config() + self.inputs = { + 'X': self.x, + 'E': self.e, + 'Src_index': self.src_index, + 'Dst_index': self.dst_index + } + self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MIN'} + + out = compute_graph_send_e_recv_for_max_min(self.inputs, self.attrs) + + self.outputs = {'Out': out} def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -218,4 +428,65 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + def test_check_output(self): + self.check_output() + + # def test_check_grad(self): + # self.check_grad(['X', 'E'], 'Out') + + +class TestMinCase1(TestGraphSendERecvMinOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +class TestMinCase2(TestGraphSendERecvMinOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 10, (150, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +class TestMinCase3(TestGraphSendERecvMinOp): + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.e = np.random.random((150, 1)).astype("float64") + index = np.random.randint(0, 10, (150, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + +class TestMinCase4(TestGraphSendERecvMinOp): + + def set_config(self): + self.x = np.random.random((10, 8, 5)).astype("float64") + self.e = np.random.random((15, 8, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +class TestMinCase5(TestGraphSendERecvMinOp): + + def set_config(self): + self.x = np.random.random((10, 8, 5)).astype("float64") + self.e = np.random.random((15, 8, 1)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] self.compute_type = 'MUL' From 81014e7562f65cfb3955360aac4c7f75973874f3 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 11 Jul 2022 10:16:52 +0000 Subject: [PATCH 23/51] add max, min unittest, fix mul bug --- .../phi/kernels/gpu/graph_send_e_recv_funcs.h | 6 +- .../gpu/graph_send_e_recv_grad_kernel.cu | 2 - .../unittests/test_graph_send_e_recv_op.py | 135 ++++++++++++++++-- 3 files changed, 125 insertions(+), 18 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h index 42b9c0fe2a1f9..6d95b813efb91 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h @@ -13,16 +13,16 @@ // limitations under the License. 
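+// CUDA functors and per-edge compute/scatter kernels shared by the
+// graph_send_e_recv forward and backward passes.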
#pragma once -#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" - #include #include + #include #include #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" namespace phi { @@ -371,7 +371,7 @@ __global__ void ManipulateMinMaxGradCUDAKernelForMul(const T* x_data, T* e_grad_off = e_grad + ty * e_len; while (tx < out_len) { int64_t x_add = use_bcast ? xbcast_off[tx] : tx; - int64_t e_add = use_bcast ? xbcast_off[tx] : tx; + int64_t e_add = use_bcast ? ebcast_off[tx] : tx; T val = x_off[x_add] * e_off[e_add]; paddle::platform::CudaAtomicAdd( x_grad_off + x_add, diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index 0f197a9e53397..8834bb69310bf 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -37,7 +37,6 @@ void CalculateXEGradForMinMax(const Context& ctx, const std::string& compute_type, const std::string& pool_type, int64_t index_size, - int64_t slice_size, T* x_grad, T* e_grad, const DenseTensor* out = nullptr) { @@ -388,7 +387,6 @@ void GraphSendERecvGradOpCUDAKernelLaunchHelper( compute_type, pool_type, index_size, - slice_size, x_grad_data, e_grad_data, out); diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py index 32a84d9d5d775..59bb992bcd1ea 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py @@ -38,6 +38,64 @@ def get_broadcast_shape(shp1, shp2): return rst +class BroadCastInfo(object): + + def __init__(self, x_shape, e_shape): + self.x_shape = x_shape + self.e_shape = e_shape + + self.calculate_bcastinfo() + + def use_bcast(self): + if len(self.x_shape) != len(self.e_shape): + return True + for i in range(1, len(self.x_shape)): + if self.x_shape[i] != self.e_shape[i]: + return True + return False + + def calculate_bcastinfo(self): + lhs_len = 1 + rhs_len = 1 + for i in range(1, len(self.x_shape)): + lhs_len *= self.x_shape[i] + for i in range(1, len(self.e_shape)): + rhs_len *= self.e_shape[i] + use_b = self.use_bcast() + + if use_b: + max_ndim = max(len(self.x_shape), len(self.e_shape)) - 1 + out_len = 1 + stride_l = stride_r = 1 + lhs_offset = [0] + rhs_offset = [0] + for j in range(0, max_ndim): + dl = 1 if (len(self.x_shape) - 1 - j) < 1 \ + else self.x_shape[len(self.x_shape) - 1 - j] + dr = 1 if (len(self.e_shape) - 1 - j) < 1 \ + else self.e_shape[len(self.e_shape) - 1 - j] + for i in range(1, max(dl, dr)): + for k in range(0, out_len): + lhs_offset.append(lhs_offset[k] + i * + (i < dl) * stride_l) + rhs_offset.append(rhs_offset[k] + i * + (i < dr) * stride_r) + + out_len *= max(dl, dr) + stride_l *= dl + stride_r *= dr + else: + out_len = rhs_len + + self.use_broadcast = use_b + self.out_len = out_len + self.lhs_len = lhs_len + self.rhs_len = rhs_len + if use_b: + self.lhs_offset = lhs_offset + self.rhs_offset = rhs_offset + + def compute_graph_send_e_recv_for_sum(inputs, attributes): x = inputs['X'] e = inputs['E'] @@ -51,7 +109,7 @@ def compute_graph_send_e_recv_for_sum(inputs, attributes): ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) results = np.zeros(out_shp, dtype=x.dtype) - # Calculate forward output + # Calculate forward output. 
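+    # gather_x holds one row of x per edge, so plain NumPy broadcasting of
+    # e against it mirrors the offset tables from the C++ CalcBCastInfo.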
if compute_type == 'ADD': x_compute_e = gather_x + e elif compute_type == 'MUL': @@ -74,7 +132,7 @@ def compute_graph_send_e_recv_for_mean(inputs, attributes): ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) results = np.zeros(out_shp, dtype=x.dtype) - # Calculate forward output + # Calculate forward output. if compute_type == 'ADD': x_compute_e = gather_x + e elif compute_type == 'MUL': @@ -102,7 +160,7 @@ def compute_graph_send_e_recv_for_max_min(inputs, attributes): ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) results = np.zeros(out_shp, dtype=x.dtype) - # Calculate forward output + # Calculate forward output. if compute_type == 'ADD': x_compute_e = gather_x + e elif compute_type == 'MUL': @@ -128,15 +186,64 @@ def compute_graph_send_e_recv_for_max_min(inputs, attributes): else: raise ValueError("Invalid pool_type, only MAX, MIN supported!") - # Calculate backward gradient + # Calculate backward gradient. x_gradient = np.zeros_like(x) e_gradient = np.zeros_like(e) + bcast_info = BroadCastInfo(x.shape, e.shape) + use_broadcast = bcast_info.use_broadcast for i in range(len(src_index)): forward_src_idx = src_index[i] forward_dst_idx = dst_index[i] - # ??? - - return results + x_off = x[forward_src_idx] + e_off = e[i] + out_off = results[forward_dst_idx] + x_grad_off = x_gradient[forward_src_idx] + e_grad_off = e_gradient[i] + for j in range(bcast_info.out_len): + x_add = bcast_info.lhs_offset[j] if use_broadcast else j + e_add = bcast_info.rhs_offset[j] if use_broadcast else j + if compute_type == 'ADD': + if len(x_off.shape) == 1 and len(e_off.shape) == 1: + val = x_off[x_add] + e_off[e_add] + x_grad_off[x_add] += 1 * (val == out_off[j]) + e_grad_off[e_add] += 1 * (val == out_off[j]) + else: + # For simplicity, we only check the situation of x_off.shape=2 + x_add_0 = int(x_add / x_off.shape[1]) + x_add_1 = int(x_add % x_off.shape[1]) + e_add_0 = int(e_add / e_off.shape[1]) + e_add_1 = int(e_add % e_off.shape[1]) + out_add_0 = int(j / out_off.shape[1]) + out_add_1 = int(j % out_off.shape[1]) + val = x_off[x_add_0][x_add_1] + e_off[e_add_0][e_add_1] + x_grad_off[x_add_0][x_add_1] += 1 * ( + val == out_off[out_add_0][out_add_1]) + e_grad_off[e_add_0][e_add_1] += 1 * ( + val == out_off[out_add_0][out_add_1]) + elif compute_type == 'MUL': + if len(x_off.shape) == 1 and len(e_off.shape) == 1: + val = x_off[x_add] * e_off[e_add] + x_grad_off[x_add] += 1 * (val == out_off[j]) * e_off[e_add] + e_grad_off[e_add] += 1 * (val == out_off[j]) * x_off[x_add] + else: + # For simplicity, we only check the situation of x_off.shape=2 + x_add_0 = int(x_add / x_off.shape[1]) + x_add_1 = int(x_add % x_off.shape[1]) + e_add_0 = int(e_add / e_off.shape[1]) + e_add_1 = int(e_add % e_off.shape[1]) + out_add_0 = int(j / out_off.shape[1]) + out_add_1 = int(j % out_off.shape[1]) + val = x_off[x_add_0][x_add_1] * e_off[e_add_0][e_add_1] + x_grad_off[x_add_0][x_add_1] += 1 * ( + val == out_off[out_add_0][out_add_1] + ) * e_off[e_add_0][e_add_1] + e_grad_off[e_add_0][e_add_1] += 1 * ( + val == out_off[out_add_0][out_add_1] + ) * x_off[x_add_0][x_add_1] + + gradients = [x_gradient / results.size, e_gradient / results.size] + + return results, gradients class TestGraphSendERecvSumOp(OpTest): @@ -330,7 +437,8 @@ def setUp(self): } self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MAX'} - out = compute_graph_send_e_recv_for_max_min(self.inputs, self.attrs) + out, self.gradients = compute_graph_send_e_recv_for_max_min( + self.inputs, self.attrs) self.outputs = {'Out': out} @@ -345,8 +453,8 @@ def 
set_config(self): def test_check_output(self): self.check_output() - # def test_check_grad(self): - # self.check_grad(['X', 'E'], 'Out') + def test_check_grad(self): + self.check_grad(['X', 'E'], 'Out', user_defined_grads=self.gradients) class TestMaxCase1(TestGraphSendERecvMaxOp): @@ -418,7 +526,8 @@ def setUp(self): } self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MIN'} - out = compute_graph_send_e_recv_for_max_min(self.inputs, self.attrs) + out, self.gradients = compute_graph_send_e_recv_for_max_min( + self.inputs, self.attrs) self.outputs = {'Out': out} @@ -433,8 +542,8 @@ def set_config(self): def test_check_output(self): self.check_output() - # def test_check_grad(self): - # self.check_grad(['X', 'E'], 'Out') + def test_check_grad(self): + self.check_grad(['X', 'E'], 'Out', user_defined_grads=self.gradients) class TestMinCase1(TestGraphSendERecvMinOp): From a02e07aae01b50239a2f4958496f5699cf7ecb79 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 12 Jul 2022 06:00:19 +0000 Subject: [PATCH 24/51] add cpu forward sum and mean --- .../phi/kernels/cpu/graph_send_e_recv_funcs.h | 46 +++++ .../kernels/cpu/graph_send_e_recv_kernel.cc | 170 ++++++++++++++++++ .../impl/graph_send_e_recv_kernel_impl.h | 35 ---- .../paddle/fluid/tests/unittests/op_test.py | 2 - 4 files changed, 216 insertions(+), 37 deletions(-) create mode 100644 paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h new file mode 100644 index 0000000000000..64c2000b47d09 --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +struct GraphAddFunctor { + inline T operator()(const T a, const T b) const { return a + b; } +}; + +template +struct GraphMulFunctor { + inline T operator()(const T a, const T b) const { return a * b; } +}; + +template +struct GraphMaxfunctor { + inline T operator()(const T a, const T b) const { return a < b ? b : a; } +}; + +template +struct GraphMinFunctor { + inline T operator()(const T a, const T b) const { return a < b ? a : b; } +}; + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc index 0544a1e298b8e..851e805272de6 100644 --- a/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc @@ -11,3 +11,173 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
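+// CPU forward path: each edge combines x[src] with its edge feature e
+// (ADD or MUL), and the per-edge results are scattered into the dst rows
+// under the requested reduction. SUM and MEAN are implemented here; the
+// MIN/MAX branch below is still a stub.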
+ +#include "paddle/phi/kernels/graph_send_e_recv_kernel.h" + +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" + +namespace phi { + +template +void GraphSendERecvSumCpuKernel(const BroadCastInfo& bcast, + const T* x_data, + const T* e_data, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + int64_t index_size, + ComputeFunctor cfunctor) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < index_size; i++) { + IndexT src = src_indices[i]; + IndexT dst = dst_indices[i]; + T* out_off = output + dst * bcast.out_len; + const T* x_off = x_data + src * bcast.l_len; + const T* e_off = e_data + i * bcast.r_len; + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; + T val = cfunctor(x_off[x_add], e_off[e_add]); + if (val != 0) { +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + out_off[j] += val; + } + } + } +} + +// template +// void GraphSendERecvMaxCpuKernel() + +template +void GraphSendERecvOpKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t out_size, + DenseTensor* out, + DenseTensor* dst_count = nullptr) { + const int& index_size = src_index.dims()[0]; + ctx.template Alloc(out); + T* out_data = out->data(); + auto out_dims = out->dims(); + int64_t memset_size = 1; + for (int i = 0; i < out_dims.size(); i++) { + memset_size *= out_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + memset(out_data, 0, memset_bytes); + + if (index_size == 0) return; + const auto& bcast_info = phi::CalcBCastInfo(x.dims(), e.dims()); + const T* x_data = x.data(); + const T* e_data = e.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + if (pool_type == "SUM" || pool_type == "MEAN") { + if (compute_type == "ADD") { + GraphAddFunctor add_functor; + GraphSendERecvSumCpuKernel>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + add_functor); + } else if (compute_type == "MUL") { + GraphMulFunctor mul_functor; + GraphSendERecvSumCpuKernel>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + mul_functor); + } + if (pool_type == "MEAN") { + int* dst_count_data = ctx.template Alloc(dst_count); + memset(dst_count_data, 0, dst_count->dims()[0] * sizeof(int)); + for (int i = 0; i < index_size; i++) { + IndexT dst_idx = d_index[i]; + dst_count_data[dst_idx] += 1; + } + for (int i = 0; i < out_dims[0]; i++) { + if (dst_count_data[i] == 0) continue; + auto out_slice = out->Slice(i, i + 1); + auto eigen_out = phi::EigenVector::Flatten(out_slice); + eigen_out = eigen_out / static_cast(dst_count_data[i]); + } + } + } else if (pool_type == "MIN" || pool_type == "MAX") { + /*if (compute_type == "ADD") { + GraphAddFunctor add_funtor; + } else if (compute_type == "MUL") { + GraphMulFunctor mul_functor; + }*/ + } +} + +template +void GraphSendERecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + 
const std::string& pool_type, + int64_t out_size, + DenseTensor* out, + DenseTensor* dst_count) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendERecvOpKernelLaunchHelper(ctx, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + out_size, + out, + dst_count); + } else if (index_type == phi::DataType::INT64) { + GraphSendERecvOpKernelLaunchHelper(ctx, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + out_size, + out, + dst_count); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_e_recv, + CPU, + ALL_LAYOUT, + phi::GraphSendERecvKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h index 5e12ea25249c1..b54f615947d3f 100644 --- a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h @@ -114,39 +114,4 @@ inline std::vector InferBroadcastShape(const phi::DDim& x_dims, return out_dims_array; } -inline std::vector GetGradReduceDim( - const std::vector& grad_dims, const std::vector& input_dims) { - // Whether to reduce the gradients. - // If there is broadcast in forward pass, gradients need to be reduced on - // broadcast dimension. - std::vector grad_shape(grad_dims.begin() + 1, grad_dims.end()); - std::vector input_shape(input_dims.begin() + 1, input_dims.end()); - std::vector reduce_idx; - bool need_reduce = false; - if (grad_shape.size() != input_shape.size()) { - need_reduce = true; - } else { - for (int i = 0; i < grad_shape.size(); i++) { - if (grad_shape[i] != input_shape[i]) { - need_reduce = true; - break; - } - } - } - if (!need_reduce) { - return reduce_idx; - } - - int num_to_squeeze = grad_shape.size() - input_shape.size(); - for (int i = 0; i < num_to_squeeze; i++) { - input_shape.insert(input_shape.begin(), 1); - } - for (int i = 0; i < input_shape.size(); i++) { - if (grad_shape[i] - input_shape[i] != 0) { - reduce_idx.push_back(i + 1); - } - } - return reduce_idx; -} - } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 72dce09eb2c50..ded9f188472dd 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1774,7 +1774,6 @@ def check_output(self, self.__class__.use_xpu = True places = self._get_places() - places = places[1:] for place in places: res = self.check_output_with_place(place, atol, @@ -1861,7 +1860,6 @@ def check_grad(self, check_eager=False): self._check_grad_helper() places = self._get_places() - places = places[1:] for place in places: self.check_grad_with_place(place, inputs_to_check, From bb5c36698524789c7d2a80b15df6ebc9541f5389 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 12 Jul 2022 07:21:27 +0000 Subject: [PATCH 25/51] add forward min max, fix mean unittest --- .../phi/kernels/cpu/graph_send_e_recv_funcs.h | 2 +- .../kernels/cpu/graph_send_e_recv_kernel.cc | 113 ++++++++++++++++-- .../unittests/test_graph_send_e_recv_op.py | 18 +-- 3 files changed, 117 insertions(+), 16 deletions(-) diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h index 64c2000b47d09..7647415d8e7cb 100644 --- a/paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h @@ -34,7 +34,7 @@ struct GraphMulFunctor { }; template -struct GraphMaxfunctor { 
+struct GraphMaxFunctor { inline T operator()(const T a, const T b) const { return a < b ? b : a; } }; diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc index 851e805272de6..93f75f456c686 100644 --- a/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc @@ -38,7 +38,7 @@ void GraphSendERecvSumCpuKernel(const BroadCastInfo& bcast, #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif - for (int i = 0; i < index_size; i++) { + for (int64_t i = 0; i < index_size; i++) { IndexT src = src_indices[i]; IndexT dst = dst_indices[i]; T* out_off = output + dst * bcast.out_len; @@ -58,8 +58,51 @@ void GraphSendERecvSumCpuKernel(const BroadCastInfo& bcast, } } -// template -// void GraphSendERecvMaxCpuKernel() +template +void GraphSendERecvMinMaxCpuKernel(const BroadCastInfo& bcast, + const T* x_data, + const T* e_data, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + int64_t index_size, + ComputeFunctor cfunctor, + CmpFunctor pfunctor) { + std::set existed_dst; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = src_indices[i]; + IndexT dst = dst_indices[i]; + T* out_off = output + dst * bcast.out_len; + const T* x_off = x_data + src * bcast.l_len; + const T* e_off = e_data + i * bcast.r_len; + bool in_set = existed_dst.find(dst) != existed_dst.end(); + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; + T val = cfunctor(x_off[x_add], e_off[e_add]); +#ifdef PADDLE_WITH_MKLML +#pragma omp critical +#endif + if (!in_set) { + out_off[j] += val; + } else { + out_off[j] = pfunctor(out_off[j], val); + } + } +#ifdef PADDLE_WITH_MKLML +#pragma omp critical +#endif + if (!in_set) { + existed_dst.emplace(dst); + } + } +} template void GraphSendERecvOpKernelLaunchHelper(const Context& ctx, @@ -125,12 +168,68 @@ void GraphSendERecvOpKernelLaunchHelper(const Context& ctx, eigen_out = eigen_out / static_cast(dst_count_data[i]); } } - } else if (pool_type == "MIN" || pool_type == "MAX") { - /*if (compute_type == "ADD") { - GraphAddFunctor add_funtor; + } else if (pool_type == "MIN") { + GraphMinFunctor min_functor; + if (compute_type == "ADD") { + GraphAddFunctor add_functor; + GraphSendERecvMinMaxCpuKernel, + GraphMinFunctor>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + add_functor, + min_functor); } else if (compute_type == "MUL") { GraphMulFunctor mul_functor; - }*/ + GraphSendERecvMinMaxCpuKernel, + GraphMinFunctor>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + mul_functor, + min_functor); + } + } else if (pool_type == "MAX") { + GraphMaxFunctor max_functor; + if (compute_type == "ADD") { + GraphAddFunctor add_functor; + GraphSendERecvMinMaxCpuKernel, + GraphMaxFunctor>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + add_functor, + max_functor); + } else if (compute_type == "MUL") { + GraphMulFunctor mul_functor; + GraphSendERecvMinMaxCpuKernel, + GraphMaxFunctor>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + mul_functor, + max_functor); + } } } diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py index 59bb992bcd1ea..14ed8fc3b45e3 100644 --- 
a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py @@ -141,7 +141,9 @@ def compute_graph_send_e_recv_for_mean(inputs, attributes): for index, s_id in enumerate(dst_index): results[s_id, :] += x_compute_e[index, :] count[s_id] += 1 - results = results / count.reshape([-1, 1]) + count_shape = [out_shp[0]] + count_shape.extend([1] * len(out_shp[1:])) + results = results / count.reshape(count_shape) results[np.isnan(results)] = 0 return results, count @@ -368,7 +370,7 @@ def test_check_grad(self): self.check_grad(['X', 'E'], 'Out') -def TestMeanCase1(TestGraphSendERecvMeanOp): +class TestMeanCase1(TestGraphSendERecvMeanOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -379,7 +381,7 @@ def set_config(self): self.compute_type = 'MUL' -def TestMeanCase2(TestGraphSendERecvMeanOp): +class TestMeanCase2(TestGraphSendERecvMeanOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -387,10 +389,10 @@ def set_config(self): index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'SUM' + self.compute_type = 'ADD' -def TestMeanCase3(TestGraphSendERecvMeanOp): +class TestMeanCase3(TestGraphSendERecvMeanOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -401,7 +403,7 @@ def set_config(self): self.compute_type = 'MUL' -def TestMeanCase4(TestGraphSendERecvMeanOp): +class TestMeanCase4(TestGraphSendERecvMeanOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") @@ -409,10 +411,10 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'SUM' + self.compute_type = 'ADD' -def TestMeanCase5(TestGraphSendERecvMeanOp): +class TestMeanCase5(TestGraphSendERecvMeanOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") From fb10fb490bccc6a4259a53b2e41849f92ded511d Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 12 Jul 2022 09:05:48 +0000 Subject: [PATCH 26/51] add cpu backward min max --- .../cpu/graph_send_e_recv_grad_kernel.cc | 292 ++++++++++++++++++ .../gpu/graph_send_e_recv_grad_kernel.cu | 13 +- 2 files changed, 295 insertions(+), 10 deletions(-) diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc index 0544a1e298b8e..26c0cc08cc074 100644 --- a/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc @@ -11,3 +11,295 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
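// A scalar reference sketch (not part of this patch) of the MIN/MAX backward
// rule the kernels below implement: gradient flows only through edge
// contributions whose combined value equals the pooled output, via a
// (val == out) mask; with "MUL" the mask is further scaled by the other
// operand. When several edges tie at the extremum, each receives the full
// upstream gradient, matching the masking below. Illustrative names, scalar
// features, and "ADD" compute_type assumed:
#include <cstdint>
#include <vector>

void min_max_backward_ref(const std::vector<float>& x,
                          const std::vector<float>& e,
                          const std::vector<float>& out,       // forward output
                          const std::vector<float>& out_grad,
                          const std::vector<int64_t>& src,
                          const std::vector<int64_t>& dst,
                          std::vector<float>* x_grad,          // zeroed
                          std::vector<float>* e_grad) {        // zeroed
  for (size_t i = 0; i < src.size(); ++i) {
    float val = x[src[i]] + e[i];
    // 1 only where this edge actually produced the pooled min/max value.
    float mask = (val == out[dst[i]]) ? 1.0f : 0.0f;
    (*x_grad)[src[i]] += out_grad[dst[i]] * mask;
    (*e_grad)[i] += out_grad[dst[i]] * mask;
  }
}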
+ +#include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" + +namespace phi { + +template +void CalculateXGrad(const T* out_grad, + const T* x_data, + const T* e_data, + const phi::DDim& out_grad_dims, + const phi::DDim& x_dims, + const phi::DDim& e_dims, + const IndexT* s_index, + const IndexT* d_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t index_size, + int64_t slice_size, + T* x_grad, + const DenseTensor* dst_count = nullptr, + const DenseTensor* out = nullptr) { + if (pool_type == "SUM") { + if (compute_type == "ADD") { + } else if (compute_type == "MUL") { + } + } else if (pool_type == "MEAN") { + if (compute_type == "ADD") { + } else if (compute_type == "MUL") { + } + } +} + +template +void CalculateEGrad(const T* out_grad_data, + const T* x_data, + const T* e_data, + const phi::DDim& x_dims, + const phi::DDim& e_dims, + const IndexT* s_index, + const IndexT* d_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t index_size, + T* e_grad, + const DenseTensor* dst_count = nullptr) { + const auto& bcast = phi::CalcBCastInfo(x_dims, e_dims); + if (pool_type == "SUM") { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + const T* x_off = x_data + src * bcast.l_len; + const T* out_grad_off = out_grad_data + dst * bcast.out_len; + T* e_grad_off = e_grad + i * bcast.r_len; + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; + if (compute_type == "ADD") { +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + e_grad_off[e_add] += out_grad_off[j]; + } else if (compute_type == "MUL") { +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + e_grad_off[e_add] += (out_grad_off[j] * x_off[x_add]); + } + } + } + } else if (pool_type == "MEAN") { + const int* s_count = dst_count->data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + const T* x_off = x_data + src * bcast.l_len; + const T* out_grad_off = out_grad_data + dst * bcast.out_len; + T* e_grad_off = e_grad + i * bcast.r_len; + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? 
bcast.r_offset[j] : j; + if (compute_type == "ADD") { +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + e_grad_off[e_add] += (out_grad_off[j] / s_count[dst]); + } else if (compute_type == "MUL") { +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + e_grad_off[e_add] += (out_grad_off[j] * x_off[x_add] / s_count[dst]); + } + } + } + } +} + +template +void CalculateXEGradForMinMax(const T* out_grad, + const T* x_data, + const T* e_data, + const phi::DDim& x_dims, + const phi::DDim& e_dims, + const IndexT* s_index, + const IndexT* d_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t index_size, + T* x_grad, + T* e_grad, + const DenseTensor* out = nullptr) { + const T* out_data = out->data(); + const auto& bcast = phi::CalcBCastInfo(x_dims, e_dims); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + const T* x_off = x_data + dst * bcast.l_len; + const T* e_off = e_data + i * bcast.r_len; + const T* out_off = out_data + src * bcast.out_len; + const T* out_grad_off = out_grad + src * bcast.out_len; + T* x_grad_off = x_grad + dst * bcast.l_len; + T* e_grad_off = e_grad + i * bcast.r_len; + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; + if (compute_type == "ADD") { + T val = x_off[x_add] + e_off[e_add]; +#ifdef PADDLE_WITH_MKLML +#pragma omp critical +#endif + x_grad_off[x_add] += (out_grad_off[j] * (val == out_off[j])); + e_grad_off[e_add] += (out_grad_off[j] * (val == out_off[j])); + } else if (compute_type == "MUL") { + T val = x_off[x_add] * e_off[e_add]; +#ifdef PADDLE_WITH_MKLML +#pragma omp critical +#endif + x_grad_off[x_add] += + (out_grad_off[j] * (val == out_off[j]) * e_off[e_add]); + e_grad_off[e_add] += + (out_grad_off[j] * (val == out_off[j]) * x_off[x_add]); + } + } + } +} + +template +void GraphSendERecvGradOpKernelLaunchHelper( + const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + const std::string& pool_type, + DenseTensor* x_grad, + DenseTensor* e_grad, + const DenseTensor* dst_count = nullptr, + const DenseTensor* out = nullptr) { + const int& index_size = dst_index.dims()[0]; + + ctx.template Alloc(x_grad); + T* x_grad_data = x_grad->data(); + ctx.template Alloc(e_grad); + T* e_grad_data = e_grad->data(); + const auto& x_dims = x.dims(); + const auto& e_dims = e.dims(); + int64_t memset_size_x = 1, memset_size_e = 1; + int64_t slice_size = 1; + for (int i = 0; i < x_dims.size(); i++) { + memset_size_x *= x_dims[i]; + if (i > 0) slice_size *= x_dims[i]; + } + for (int i = 0; i < e_dims.size(); i++) { + memset_size_e *= e_dims[i]; + } + const size_t& memset_bytes_x = memset_size_x * sizeof(T); + const size_t& memset_bytes_e = memset_size_e * sizeof(T); + memset(x_grad_data, 0, memset_bytes_x); + memset(e_grad_data, 0, memset_bytes_e); + + if (index_size == 0) return; + + const T* out_grad_data = out_grad.data(); + const T* x_data = x.data(); + const T* e_data = e.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + + if (pool_type == "SUM" || pool_type == "MEAN") { + CalculateEGrad(out_grad_data, + x_data, + e_data, + x_dims, + e_dims, + s_index, + d_index, + compute_type, + pool_type, + index_size, + e_grad_data, + 
dst_count); + } else if (pool_type == "MIN" || pool_type == "MAX") { + CalculateXEGradForMinMax(out_grad_data, + x_data, + e_data, + x_dims, + e_dims, + d_index, + s_index, + compute_type, + pool_type, + index_size, + x_grad_data, + e_grad_data, + out); + } +} + +template +void GraphSendERecvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, + const std::string& compute_type, + const std::string& pool_type, + DenseTensor* x_grad, + DenseTensor* e_grad) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendERecvGradOpKernelLaunchHelper( + ctx, + out_grad, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + x_grad, + e_grad, + dst_count.get_ptr(), + out.get_ptr()); + } else if (index_type == phi::DataType::INT64) { + GraphSendERecvGradOpKernelLaunchHelper( + ctx, + out_grad, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + x_grad, + e_grad, + dst_count.get_ptr(), + out.get_ptr()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_e_recv_grad, + CPU, + ALL_LAYOUT, + phi::GraphSendERecvGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu index 8834bb69310bf..6c3b791308417 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -12,15 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" -#include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h" #include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" -#include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -29,7 +28,6 @@ void CalculateXEGradForMinMax(const Context& ctx, const T* out_grad, const T* x_data, const T* e_data, - const phi::DDim& out_grad_dims, const phi::DDim& x_dims, const phi::DDim& e_dims, const IndexT* s_index, @@ -208,7 +206,6 @@ void CalculateEGrad(const Context& ctx, const T* out_grad, const T* x_data, const T* e_data, - const phi::DDim& out_grad_dims, const phi::DDim& x_dims, const phi::DDim& e_dims, const IndexT* s_index, @@ -217,8 +214,7 @@ void CalculateEGrad(const Context& ctx, const std::string& pool_type, int64_t index_size, T* e_grad, - const DenseTensor* dst_count = nullptr, - const DenseTensor* out = nullptr) { + const DenseTensor* dst_count = nullptr) { const auto& bcast_info = phi::CalcBCastInfo(x_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { @@ -363,7 +359,6 @@ void GraphSendERecvGradOpCUDAKernelLaunchHelper( out_grad_data, x_data, e_data, - out_grad.dims(), x_dims, e_dims, s_index, @@ -372,14 +367,12 @@ void GraphSendERecvGradOpCUDAKernelLaunchHelper( pool_type, index_size, e_grad_data, - dst_count, - out); + dst_count); } else if (pool_type == "MIN" || pool_type == "MAX") { CalculateXEGradForMinMax(ctx, out_grad_data, x_data, e_data, - 
out_grad.dims(), x_dims, e_dims, s_index, From e59e516bd64f912a5c38bd148a2c4779344164aa Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 12 Jul 2022 09:20:41 +0000 Subject: [PATCH 27/51] fix code-style --- .../fluid/operators/graph_send_e_recv_op.cc | 6 +- .../kernels/gpu/graph_send_e_recv_kernel.cu | 244 +++++++++--------- 2 files changed, 126 insertions(+), 124 deletions(-) diff --git a/paddle/fluid/operators/graph_send_e_recv_op.cc b/paddle/fluid/operators/graph_send_e_recv_op.cc index 994153f50af81..2b760b4508a9c 100644 --- a/paddle/fluid/operators/graph_send_e_recv_op.cc +++ b/paddle/fluid/operators/graph_send_e_recv_op.cc @@ -134,9 +134,11 @@ class GraphSendERecvGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(graph_send_e_recv, GraphSendERecvInferShapeFunctor, +DECLARE_INFER_SHAPE_FUNCTOR(graph_send_e_recv, + GraphSendERecvInferShapeFunctor, PD_INFER_META(phi::GraphSendERecvInferMeta)); -REGISTER_OPERATOR(graph_send_e_recv, ops::GraphSendERecvOP, +REGISTER_OPERATOR(graph_send_e_recv, + ops::GraphSendERecvOP, ops::GraphSendERecvOpMaker, ops::GraphSendERecvGradOpMaker, ops::GraphSendERecvGradOpMaker, diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu index 375e7209bf3a9..7bf6523e60747 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/graph_send_e_recv_kernel.h" #include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" -#include "paddle/phi/kernels/graph_send_e_recv_kernel.h" #include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" #include @@ -99,46 +99,46 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, GraphSendERecvSumCUDAFunctor sum_functor; if (compute_type == "ADD") { funcs::AddFunctor add_funtor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvSumCUDAFunctor, - funcs::AddFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - thrust::raw_pointer_cast(x_bcastoff.data()), - thrust::raw_pointer_cast(e_bcastoff.data()), - out_data, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast, - add_funtor, - sum_functor); + GraphSendERecvCUDAKernel, + funcs::AddFunctor> + <<>>( + x_data, + e_data, + s_index, + d_index, + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), + out_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + add_funtor, + sum_functor); } else if (compute_type == "MUL") { funcs::MultiplyFunctor mul_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvSumCUDAFunctor, - funcs::MultiplyFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - thrust::raw_pointer_cast(x_bcastoff.data()), - thrust::raw_pointer_cast(e_bcastoff.data()), - out_data, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast, - mul_functor, - sum_functor); + GraphSendERecvCUDAKernel, + funcs::MultiplyFunctor> + <<>>( + x_data, + e_data, + s_index, + d_index, + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), + out_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + mul_functor, 
+ sum_functor); } if (pool_type == "MEAN") { ctx.template Alloc(dst_count); @@ -152,9 +152,9 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, cudaMemset(dst_count_data, 0, input_size * sizeof(int)); #endif int64_t grid_count = (index_size + block_ - 1) / block_; - ComputeCountCUDAKernel<<>>( - dst_count_data, d_index, index_size); + ComputeCountCUDAKernel + <<>>( + dst_count_data, d_index, index_size); int64_t grid_mean = (input_size * out_len + block_ - 1) / block_; int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; @@ -167,46 +167,46 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, GraphSendERecvMaxCUDAFunctor max_functor; if (compute_type == "ADD") { funcs::AddFunctor add_funtor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvMaxCUDAFunctor, - funcs::AddFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - thrust::raw_pointer_cast(x_bcastoff.data()), - thrust::raw_pointer_cast(e_bcastoff.data()), - out_data, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast, - add_funtor, - max_functor); + GraphSendERecvCUDAKernel, + funcs::AddFunctor> + <<>>( + x_data, + e_data, + s_index, + d_index, + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), + out_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + add_funtor, + max_functor); } else if (compute_type == "MUL") { funcs::MultiplyFunctor mul_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvMaxCUDAFunctor, - funcs::MultiplyFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - thrust::raw_pointer_cast(x_bcastoff.data()), - thrust::raw_pointer_cast(e_bcastoff.data()), - out_data, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast, - mul_functor, - max_functor); + GraphSendERecvCUDAKernel, + funcs::MultiplyFunctor> + <<>>( + x_data, + e_data, + s_index, + d_index, + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), + out_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + mul_functor, + max_functor); } if (out_size > 0) { input_size = out_size; @@ -214,52 +214,52 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, int64_t grid_max = (input_size * out_len + block_ - 1) / block_; int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_max_ = grid_max < max_grid_dimx ? 
grid_max : max_grid_dimx; - InputResetMaxCUDAKernel<<>>( - out_data, input_size, out_len); + InputResetMaxCUDAKernel + <<>>(out_data, input_size, out_len); } else if (pool_type == "MIN") { GraphSendERecvMinCUDAFunctor min_functor; if (compute_type == "ADD") { funcs::AddFunctor add_funtor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvMinCUDAFunctor, - funcs::AddFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - thrust::raw_pointer_cast(x_bcastoff.data()), - thrust::raw_pointer_cast(e_bcastoff.data()), - out_data, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast, - add_funtor, - min_functor); + GraphSendERecvCUDAKernel, + funcs::AddFunctor> + <<>>( + x_data, + e_data, + s_index, + d_index, + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), + out_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + add_funtor, + min_functor); } else if (compute_type == "MUL") { funcs::MultiplyFunctor mul_functor; - GraphSendERecvCUDAKernel< - T, - IndexT, - GraphSendERecvMinCUDAFunctor, - funcs::MultiplyFunctor><<>>( - x_data, - e_data, - s_index, - d_index, - thrust::raw_pointer_cast(x_bcastoff.data()), - thrust::raw_pointer_cast(e_bcastoff.data()), - out_data, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast, - mul_functor, - min_functor); + GraphSendERecvCUDAKernel, + funcs::MultiplyFunctor> + <<>>( + x_data, + e_data, + s_index, + d_index, + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(e_bcastoff.data()), + out_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + mul_functor, + min_functor); } if (out_size > 0) { input_size = out_size; @@ -267,8 +267,8 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, int64_t grid_min = (input_size * out_len + block_ - 1) / block_; int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_min_ = grid_min < max_grid_dimx ? 
grid_min : max_grid_dimx; - InputResetMinCUDAKernel<<>>( - out_data, input_size, out_len); + InputResetMinCUDAKernel + <<>>(out_data, input_size, out_len); } } From de7782e56820e6738afd9035653405640842fee1 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 13 Jul 2022 02:49:33 +0000 Subject: [PATCH 28/51] add backward sum mean --- .../cpu/graph_send_e_recv_grad_kernel.cc | 79 ++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc index 26c0cc08cc074..0c1a306e53f8c 100644 --- a/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc @@ -21,6 +21,7 @@ #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h" +#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" #include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" namespace phi { @@ -37,17 +38,77 @@ void CalculateXGrad(const T* out_grad, const std::string& compute_type, const std::string& pool_type, int64_t index_size, - int64_t slice_size, T* x_grad, + const DenseTensor& out_grad_tensor, + DenseTensor* x_grad_tensor, const DenseTensor* dst_count = nullptr, const DenseTensor* out = nullptr) { if (pool_type == "SUM") { if (compute_type == "ADD") { + GraphSendRecvSumFunctor sum_functor; + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + ElementwiseInnerOperation>( + out_grad_tensor, x_grad_tensor, src, dst, false, sum_functor); + } } else if (compute_type == "MUL") { + const auto& bcast = phi::CalcBCastInfo(out_grad_dims, e_dims); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + T* x_grad_off = x_grad + dst * bcast.out_len; + const T* out_grad_off = out_grad + src * bcast.l_len; + const T* e_off = e_data + i * bcast.r_len; + for (int j = 0; j < bcast.out_len; j++) { + int64_t o_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; + T val = out_grad_off[o_add] * e_off[e_add]; + if (val != 0) { +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + x_grad_off[j] += val; + } + } + } } } else if (pool_type == "MEAN") { + const int* s_count = dst_count->data(); if (compute_type == "ADD") { + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + auto out_grad_slice = out_grad_tensor.Slice(src, src + 1); + auto x_grad_slice = x_grad_tensor->Slice(dst, dst + 1); + auto eigen_out_grad = phi::EigenVector::Flatten(out_grad_slice); + auto eigen_x_grad = phi::EigenVector::Flatten(x_grad_slice); + eigen_x_grad += (eigen_out_grad / static_cast(s_count[src])); + } } else if (compute_type == "MUL") { + const auto& bcast = phi::CalcBCastInfo(out_grad_dims, e_dims); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + const T* out_grad_off = out_grad + src * bcast.l_len; + const T* e_off = e_data + i * bcast.r_len; + T* x_grad_off = x_grad + dst * bcast.out_len; + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t o_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? 
bcast.r_offset[j] : j; + T val = out_grad_off[o_add] * e_off[e_add]; +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + x_grad_off[j] += (val / s_count[src]); + } + } } } } @@ -219,6 +280,22 @@ void GraphSendERecvGradOpKernelLaunchHelper( const IndexT* d_index = dst_index.data(); if (pool_type == "SUM" || pool_type == "MEAN") { + CalculateXGrad(out_grad_data, + x_data, + e_data, + out_grad.dims(), + x_dims, + e_dims, + d_index, + s_index, + compute_type, + pool_type, + index_size, + x_grad_data, + out_grad, + x_grad, + dst_count, + out); CalculateEGrad(out_grad_data, x_data, e_data, From cd10b9e7c901ed1fdc9646c92a0212cb11e3b79a Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 13 Jul 2022 04:49:06 +0000 Subject: [PATCH 29/51] fix rocm ci --- paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h index 6d95b813efb91..34d5f22d165b6 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h @@ -33,6 +33,16 @@ inline void CopyBCastOff(const BroadCastInfo& bcast_info, thrust::device_vector& r_bcastoff) { l_bcastoff.resize(bcast_info.out_len); r_bcastoff.resize(bcast_info.out_len); +#ifdef PADDLE_WITH_HIP + hipMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), + bcast_info.l_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + hipMemcpyHostToDevice); + hipMemcpy(thrust::raw_pointer_cast(r_bcastoff.data()), + bcast_info.r_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + hipMemcpyHostToDevice); +#else cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), bcast_info.l_offset.data(), sizeof(int64_t) * bcast_info.out_len, @@ -41,6 +51,7 @@ inline void CopyBCastOff(const BroadCastInfo& bcast_info, bcast_info.r_offset.data(), sizeof(int64_t) * bcast_info.out_len, cudaMemcpyHostToDevice); +#endif } inline int FindNumThreads(int dim, int max_num_threads = CUDA_MAX_NUM_THREADS) { From 737da405ac2b3a91276f68b0f853f2b8f3238b86 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 13 Jul 2022 08:44:19 +0000 Subject: [PATCH 30/51] set uniitest timeout --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 606f39c5e3b42..05181d4834e79 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1536,6 +1536,7 @@ set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120) set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) +set_tests_properties(test_graph_send_e_recv_op PROPERTIES TIMEOUT 60) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) From 5f6e0b585e05127cce80c5c5017883ca5442306f Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 19 Jul 2022 13:51:30 +0000 Subject: [PATCH 31/51] fix bug of x broadcast to e, gpu grad --- .../gpu/graph_send_e_recv_grad_kernel.cu | 229 ++++++++++++++---- .../impl/graph_send_e_recv_kernel_impl.h | 22 ++ .../unittests/test_graph_send_e_recv_op.py | 88 +++++++ 3 files changed, 295 insertions(+), 44 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu 
b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
index 6c3b791308417..85ff70a4af840 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu @@ -16,10 +16,13 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" #include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -107,6 +110,7 @@ void CalculateXGrad(const Context& ctx, int64_t index_size, int64_t slice_size, T* x_grad, + const DenseTensor& out_grad_tensor, const DenseTensor* dst_count = nullptr, const DenseTensor* out = nullptr) { #ifdef PADDLE_WITH_HIP @@ -118,17 +122,50 @@ void CalculateXGrad(const Context& ctx, int max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; int64_t grid_tmp = (n + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; + std::vector reduce_idx; + bool reduce = ReduceGrad(out_grad_dims, x_dims, reduce_idx); if (pool_type == "SUM") { if (compute_type == "ADD") { GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel> - <<>>(out_grad, - d_index, - s_index, - x_grad, - index_size, - slice_size, - functor); + if (!reduce) { + GraphSendRecvCUDAKernel> + <<>>(out_grad, + d_index, + s_index, + x_grad, + index_size, + slice_size, + functor); + } else { + const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); + DenseTensor x_grad_v2 = + phi::EmptyLike(ctx, out_grad_tensor); + phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); + T* x_grad_v2_data = x_grad_v2.data(); + GraphSendRecvCUDAKernel> + <<>>(out_grad, + d_index, + s_index, + x_grad_v2_data, + index_size, + bcast_info.out_len, + functor); + // Run reduce_sum + DenseTensor x_grad_out = phi::Sum( + ctx, + x_grad_v2, + reduce_idx, + paddle::experimental::CppTypeToDataType::Type(), + true); + cudaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + cudaMemcpyDeviceToDevice); + } } else if (compute_type == "MUL") { const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; @@ -144,31 +181,100 @@ void CalculateXGrad(const Context& ctx, const dim3 block_(ntx, nty); funcs::MultiplyFunctor mul_functor; GraphSendERecvSumCUDAFunctor sum_functor; - GraphSendERecvCUDAKernel, - funcs::MultiplyFunctor> - <<>>( - out_grad, - e_data, - d_index, - s_index, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - x_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast, - mul_functor, - sum_functor); + if (!reduce) { + GraphSendERecvCUDAKernel, + funcs::MultiplyFunctor> + <<>>( + out_grad, + e_data, + d_index, + s_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + mul_functor, + sum_functor); + } else { + DenseTensor x_grad_v2 = + phi::EmptyLike(ctx, out_grad_tensor); + phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); + T* x_grad_v2_data = x_grad_v2.data(); + GraphSendERecvCUDAKernel, + funcs::MultiplyFunctor> + <<>>( + out_grad, + e_data, + d_index, + s_index, + 
thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad_v2_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + mul_functor, + sum_functor); + DenseTensor x_grad_out = phi::Sum( + ctx, + x_grad_v2, + reduce_idx, + paddle::experimental::CppTypeToDataType::Type(), + true); + cudaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + cudaMemcpyDeviceToDevice); + } } } else if (pool_type == "MEAN") { const int* s_count = dst_count->data(); if (compute_type == "ADD") { - ManipulateMeanGradCUDAKernel<<>>( - out_grad, d_index, s_index, x_grad, index_size, slice_size, s_count); + if (!reduce) { + ManipulateMeanGradCUDAKernel + <<>>(out_grad, + d_index, + s_index, + x_grad, + index_size, + slice_size, + s_count); + } else { + const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); + DenseTensor x_grad_v2 = + phi::EmptyLike(ctx, out_grad_tensor); + phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); + T* x_grad_v2_data = x_grad_v2.data(); + ManipulateMeanGradCUDAKernel + <<>>(out_grad, + d_index, + s_index, + x_grad_v2_data, + index_size, + bcast_info.out_len, + s_count); + // Run reduce_sum + DenseTensor x_grad_out = phi::Sum( + ctx, + x_grad_v2, + reduce_idx, + paddle::experimental::CppTypeToDataType::Type(), + true); + cudaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + cudaMemcpyDeviceToDevice); + } } else if (compute_type == "MUL") { const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; @@ -182,21 +288,55 @@ void CalculateXGrad(const Context& ctx, const int nby = (index_size + nty - 1) / nty; const dim3 grid_(nbx, nby); const dim3 block_(ntx, nty); - ManipulateMeanGradCUDAKernelForMulX - <<>>( - out_grad, - e_data, - d_index, - s_index, - s_count, - thrust::raw_pointer_cast(l_bcastoff.data()), - thrust::raw_pointer_cast(r_bcastoff.data()), - x_grad, - index_size, - bcast_info.l_len, - bcast_info.r_len, - out_len, - bcast_info.use_bcast); + if (!reduce) { + ManipulateMeanGradCUDAKernelForMulX + <<>>( + out_grad, + e_data, + d_index, + s_index, + s_count, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); + } else { + DenseTensor x_grad_v2 = + phi::EmptyLike(ctx, out_grad_tensor); + phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); + T* x_grad_v2_data = x_grad_v2.data(); + ManipulateMeanGradCUDAKernelForMulX + <<>>( + out_grad, + e_data, + d_index, + s_index, + s_count, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad_v2_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast); + // Run reduce_sum + DenseTensor x_grad_out = phi::Sum( + ctx, + x_grad_v2, + reduce_idx, + paddle::experimental::CppTypeToDataType::Type(), + true); + // TODO(daisiming): Whether use x_grad instead. 
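// A host-side reference sketch (not part of this patch) of what this
// sequence computes when x was broadcast in the forward pass: gradients are
// first scattered into x_grad_v2, a buffer shaped like out_grad, then summed
// with phi::Sum over the broadcast axes reported by ReduceGrad, and finally
// copied back into x_grad. Minimal 2-D illustration with illustrative names:
#include <cstdint>
#include <vector>

// x was [n, 1] broadcast against per-edge width d, so wide_grad is [n, d].
void reduce_broadcast_grad_ref(const std::vector<float>& wide_grad,
                               int64_t n, int64_t d,
                               std::vector<float>* x_grad) {  // [n], zeroed
  for (int64_t i = 0; i < n; ++i)
    for (int64_t j = 0; j < d; ++j)
      (*x_grad)[i] += wide_grad[i * d + j];  // sum over the broadcast axis
}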
+ cudaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + cudaMemcpyDeviceToDevice); + } } } } @@ -353,6 +493,7 @@ void GraphSendERecvGradOpCUDAKernelLaunchHelper( index_size, slice_size, x_grad_data, + out_grad, dst_count, out); CalculateEGrad(ctx, diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h index b54f615947d3f..35e51fb930c8d 100644 --- a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h +++ b/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h @@ -114,4 +114,26 @@ inline std::vector InferBroadcastShape(const phi::DDim& x_dims, return out_dims_array; } +inline bool ReduceGrad(const phi::DDim& out_grad_dims, + const phi::DDim& x_dims, + std::vector& axis) { + // We must ensure the ndim of out_grad and x are the same. + bool reduce = false; + for (int i = 1; i < out_grad_dims.size(); i++) { + if (out_grad_dims[i] != x_dims[i]) { + reduce = true; + break; + } + } + if (!reduce) return false; + + // Get reduce axis. + for (int i = 1; i < out_grad_dims.size(); i++) { + if (out_grad_dims[i] - x_dims[i] != 0) { + axis.emplace_back(i); + } + } + return true; +} + } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py index 14ed8fc3b45e3..c936ca20dbc99 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py @@ -336,6 +336,28 @@ def set_config(self): self.compute_type = 'MUL' +class TestSumCase6(TestGraphSendERecvSumOp): + + def set_config(self): + self.x = np.random.random((100, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +class TestSumCase7(TestGraphSendERecvSumOp): + + def set_config(self): + self.x = np.random.random((100, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + class TestGraphSendERecvMeanOp(OpTest): def setUp(self): @@ -425,6 +447,28 @@ def set_config(self): self.compute_type = 'MUL' +class TestMeanCase6(TestGraphSendERecvMeanOp): + + def set_config(self): + self.x = np.random.random((100, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +class TestMeanCase7(TestGraphSendERecvMeanOp): + + def set_config(self): + self.x = np.random.random((100, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + class TestGraphSendERecvMaxOp(OpTest): def setUp(self): @@ -514,6 +558,28 @@ def set_config(self): self.compute_type = 'MUL' +class TestMaxCase6(TestGraphSendERecvMaxOp): + + def set_config(self): + self.x = np.random.random((100, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' 
+ + +class TestMaxCase7(TestGraphSendERecvMaxOp): + + def set_config(self): + self.x = np.random.random((100, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' + + class TestGraphSendERecvMinOp(OpTest): def setUp(self): @@ -601,3 +667,25 @@ def set_config(self): self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'MUL' + + +class TestMinCase6(TestGraphSendERecvMinOp): + + def set_config(self): + self.x = np.random.random((100, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +class TestMinCase7(TestGraphSendERecvMinOp): + + def set_config(self): + self.x = np.random.random((100, 1)).astype("float64") + self.e = np.random.random((15, 20)).astype("float64") + index = np.random.randint(0, 100, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' From ba4a65a09fede2e1c2463625030b16427082cdaa Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 20 Jul 2022 03:33:36 +0000 Subject: [PATCH 32/51] fix bug of x broadcast to e, cpu grad --- .../cpu/graph_send_e_recv_grad_kernel.cc | 225 +++++++++++++----- 1 file changed, 171 insertions(+), 54 deletions(-) diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc index 0c1a306e53f8c..124c4beb0bde0 100644 --- a/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc @@ -22,12 +22,16 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h" #include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { -template -void CalculateXGrad(const T* out_grad, +template +void CalculateXGrad(const Context& ctx, + const T* out_grad, const T* x_data, const T* e_data, const phi::DDim& out_grad_dims, @@ -43,71 +47,183 @@ void CalculateXGrad(const T* out_grad, DenseTensor* x_grad_tensor, const DenseTensor* dst_count = nullptr, const DenseTensor* out = nullptr) { + std::vector reduce_idx; + bool reduce = ReduceGrad(out_grad_dims, x_dims, reduce_idx); + if (pool_type == "SUM") { if (compute_type == "ADD") { GraphSendRecvSumFunctor sum_functor; - for (int64_t i = 0; i < index_size; i++) { - IndexT src = s_index[i]; - IndexT dst = d_index[i]; - ElementwiseInnerOperation>( - out_grad_tensor, x_grad_tensor, src, dst, false, sum_functor); + if (!reduce) { + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + ElementwiseInnerOperation>( + out_grad_tensor, x_grad_tensor, src, dst, false, sum_functor); + } + } else { + DenseTensor x_grad_v2 = + phi::EmptyLike(ctx, out_grad_tensor); + phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + ElementwiseInnerOperation>( + out_grad_tensor, &x_grad_v2, src, dst, false, sum_functor); + } + DenseTensor x_grad_out = phi::Sum( + ctx, 
+ x_grad_v2, + reduce_idx, + paddle::experimental::CppTypeToDataType::Type(), + true); + memcpy(x_grad, x_grad_out.data(), x_grad_out.numel() * sizeof(T)); } } else if (compute_type == "MUL") { const auto& bcast = phi::CalcBCastInfo(out_grad_dims, e_dims); + if (!reduce) { #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif - for (int64_t i = 0; i < index_size; i++) { - IndexT src = s_index[i]; - IndexT dst = d_index[i]; - T* x_grad_off = x_grad + dst * bcast.out_len; - const T* out_grad_off = out_grad + src * bcast.l_len; - const T* e_off = e_data + i * bcast.r_len; - for (int j = 0; j < bcast.out_len; j++) { - int64_t o_add = bcast.use_bcast ? bcast.l_offset[j] : j; - int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; - T val = out_grad_off[o_add] * e_off[e_add]; - if (val != 0) { + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + T* x_grad_off = x_grad + dst * bcast.out_len; + const T* out_grad_off = out_grad + src * bcast.l_len; + const T* e_off = e_data + i * bcast.r_len; + for (int j = 0; j < bcast.out_len; j++) { + int64_t o_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; + T val = out_grad_off[o_add] * e_off[e_add]; + if (val != 0) { #ifdef PADDLE_WITH_MKLML #pragma omp atomic #endif - x_grad_off[j] += val; + x_grad_off[j] += val; + } } } + } else { + DenseTensor x_grad_v2 = + phi::EmptyLike(ctx, out_grad_tensor); + phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); + T* x_grad_v2_data = x_grad_v2.data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + T* x_grad_off = x_grad_v2_data + dst * bcast.out_len; + const T* out_grad_off = out_grad + src * bcast.l_len; + const T* e_off = e_data + i * bcast.r_len; + for (int j = 0; j < bcast.out_len; j++) { + int64_t o_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? 
bcast.r_offset[j] : j;
+          T val = out_grad_off[o_add] * e_off[e_add];
+          if (val != 0) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp atomic
+#endif
+            x_grad_off[j] += val;
+          }
+        }
+      }
+      DenseTensor x_grad_out = phi::Sum<T, Context>(
+          ctx,
+          x_grad_v2,
+          reduce_idx,
+          paddle::experimental::CppTypeToDataType<T>::Type(),
+          true);
+      memcpy(x_grad, x_grad_out.data<T>(), x_grad_out.numel() * sizeof(T));
      }
    }
  } else if (pool_type == "MEAN") {
    const int* s_count = dst_count->data<int>();
    if (compute_type == "ADD") {
-      for (int64_t i = 0; i < index_size; i++) {
-        IndexT src = s_index[i];
-        IndexT dst = d_index[i];
-        auto out_grad_slice = out_grad_tensor.Slice(src, src + 1);
-        auto x_grad_slice = x_grad_tensor->Slice(dst, dst + 1);
-        auto eigen_out_grad = phi::EigenVector<T>::Flatten(out_grad_slice);
-        auto eigen_x_grad = phi::EigenVector<T>::Flatten(x_grad_slice);
-        eigen_x_grad += (eigen_out_grad / static_cast<T>(s_count[src]));
+      if (!reduce) {
+        for (int64_t i = 0; i < index_size; i++) {
+          IndexT src = s_index[i];
+          IndexT dst = d_index[i];
+          auto out_grad_slice = out_grad_tensor.Slice(src, src + 1);
+          auto x_grad_slice = x_grad_tensor->Slice(dst, dst + 1);
+          auto eigen_out_grad = phi::EigenVector<T>::Flatten(out_grad_slice);
+          auto eigen_x_grad = phi::EigenVector<T>::Flatten(x_grad_slice);
+          eigen_x_grad += (eigen_out_grad / static_cast<T>(s_count[src]));
+        }
+      } else {
+        DenseTensor x_grad_v2 =
+            phi::EmptyLike<T, Context>(ctx, out_grad_tensor);
+        phi::funcs::SetConstant<Context, T>()(ctx, &x_grad_v2, T(0));
+        for (int64_t i = 0; i < index_size; i++) {
+          IndexT src = s_index[i];
+          IndexT dst = d_index[i];
+          auto out_grad_slice = out_grad_tensor.Slice(src, src + 1);
+          auto x_grad_slice = x_grad_v2.Slice(dst, dst + 1);
+          auto eigen_out_grad = phi::EigenVector<T>::Flatten(out_grad_slice);
+          auto eigen_x_grad = phi::EigenVector<T>::Flatten(x_grad_slice);
+          eigen_x_grad += (eigen_out_grad / static_cast<T>(s_count[src]));
+        }
+        DenseTensor x_grad_out = phi::Sum<T, Context>(
+            ctx,
+            x_grad_v2,
+            reduce_idx,
+            paddle::experimental::CppTypeToDataType<T>::Type(),
+            true);
+        memcpy(x_grad, x_grad_out.data<T>(), x_grad_out.numel() * sizeof(T));
      }
    } else if (compute_type == "MUL") {
      const auto& bcast = phi::CalcBCastInfo(out_grad_dims, e_dims);
+      if (!reduce) {
 #ifdef PADDLE_WITH_MKLML
 #pragma omp parallel for
 #endif
-      for (int64_t i = 0; i < index_size; i++) {
-        IndexT src = s_index[i];
-        IndexT dst = d_index[i];
-        const T* out_grad_off = out_grad + src * bcast.l_len;
-        const T* e_off = e_data + i * bcast.r_len;
-        T* x_grad_off = x_grad + dst * bcast.out_len;
-        for (int64_t j = 0; j < bcast.out_len; j++) {
-          int64_t o_add = bcast.use_bcast ? bcast.l_offset[j] : j;
-          int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j;
-          T val = out_grad_off[o_add] * e_off[e_add];
+        for (int64_t i = 0; i < index_size; i++) {
+          IndexT src = s_index[i];
+          IndexT dst = d_index[i];
+          const T* out_grad_off = out_grad + src * bcast.l_len;
+          const T* e_off = e_data + i * bcast.r_len;
+          T* x_grad_off = x_grad + dst * bcast.out_len;
+          for (int64_t j = 0; j < bcast.out_len; j++) {
+            int64_t o_add = bcast.use_bcast ? bcast.l_offset[j] : j;
+            int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j;
+            T val = out_grad_off[o_add] * e_off[e_add];
 #ifdef PADDLE_WITH_MKLML
 #pragma omp atomic
 #endif
-          x_grad_off[j] += (val / s_count[src]);
+            x_grad_off[j] += (val / s_count[src]);
+          }
        }
+      } else {
+        DenseTensor x_grad_v2 =
+            phi::EmptyLike<T, Context>(ctx, out_grad_tensor);
+        phi::funcs::SetConstant<Context, T>()(ctx, &x_grad_v2, T(0));
+        T* x_grad_v2_data = x_grad_v2.data<T>();
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+        for (int64_t i = 0; i < index_size; i++) {
+          IndexT src = s_index[i];
+          IndexT dst = d_index[i];
+          const T* out_grad_off = out_grad + src * bcast.l_len;
+          const T* e_off = e_data + i * bcast.r_len;
+          T* x_grad_off = x_grad_v2_data + dst * bcast.out_len;
+          for (int64_t j = 0; j < bcast.out_len; j++) {
+            int64_t o_add = bcast.use_bcast ? bcast.l_offset[j] : j;
+            int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j;
+            T val = out_grad_off[o_add] * e_off[e_add];
+#ifdef PADDLE_WITH_MKLML
+#pragma omp atomic
+#endif
+            x_grad_off[j] += (val / s_count[src]);
+          }
+        }
+        DenseTensor x_grad_out = phi::Sum<T, Context>(
+            ctx,
+            x_grad_v2,
+            reduce_idx,
+            paddle::experimental::CppTypeToDataType<T>::Type(),
+            true);
+        memcpy(x_grad, x_grad_out.data<T>(), x_grad_out.numel() * sizeof(T));
      }
    }
  }
@@ -280,22 +396,23 @@ void GraphSendERecvGradOpKernelLaunchHelper(
   const IndexT* d_index = dst_index.data<IndexT>();
   if (pool_type == "SUM" || pool_type == "MEAN") {
-    CalculateXGrad<T, IndexT>(out_grad_data,
-                              x_data,
-                              e_data,
-                              out_grad.dims(),
-                              x_dims,
-                              e_dims,
-                              d_index,
-                              s_index,
-                              compute_type,
-                              pool_type,
-                              index_size,
-                              x_grad_data,
-                              out_grad,
-                              x_grad,
-                              dst_count,
-                              out);
+    CalculateXGrad<Context, T, IndexT>(ctx,
+                                       out_grad_data,
+                                       x_data,
+                                       e_data,
+                                       out_grad.dims(),
+                                       x_dims,
+                                       e_dims,
+                                       d_index,
+                                       s_index,
+                                       compute_type,
+                                       pool_type,
+                                       index_size,
+                                       x_grad_data,
+                                       out_grad,
+                                       x_grad,
+                                       dst_count,
+                                       out);
     CalculateEGrad<T, IndexT>(out_grad_data,
                               x_data,
                               e_data,

From 575ab03174c3b17b075881c64a20e2f7f5b8376f Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Wed, 20 Jul 2022 12:47:52 +0000
Subject: [PATCH 33/51] rename BOOST_GET_CONST macro

---
 paddle/fluid/operators/graph_send_e_recv_op.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/graph_send_e_recv_op.cc b/paddle/fluid/operators/graph_send_e_recv_op.cc
index 2b760b4508a9c..6c93279be6e81 100644
--- a/paddle/fluid/operators/graph_send_e_recv_op.cc
+++ b/paddle/fluid/operators/graph_send_e_recv_op.cc
@@ -113,12 +113,12 @@ class GraphSendERecvGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetInput("Src_index", this->Input("Src_index"));
     op->SetInput("Dst_index", this->Input("Dst_index"));

-    if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") {
+    if (PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") {
       op->SetInput("Dst_count", this->Output("Dst_count"));
     }

-    if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" ||
-        BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") {
+    if (PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" ||
+        PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") {
       op->SetInput("Out", this->Output("Out"));
     }

From 10b5cc7820248f391e32426b69c9a482a1e55236 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Fri, 22 Jul 2022 09:50:43 +0000
Subject: [PATCH 34/51] fix rocm ci

---
 .../gpu/graph_send_e_recv_grad_kernel.cu      | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
index 85ff70a4af840..7782061c3f345 100644
--- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu
@@ -161,10 +161,17 @@ void CalculateXGrad(const Context& ctx,
           reduce_idx,
           paddle::experimental::CppTypeToDataType<T>::Type(),
           true);
+#ifdef PADDLE_WITH_HIP
+      hipMemcpy(x_grad,
+                x_grad_out.data<T>(),
+                x_grad_out.numel() * sizeof(T),
+                hipMemcpyDeviceToDevice);
+#else
       cudaMemcpy(x_grad,
                  x_grad_out.data<T>(),
                  x_grad_out.numel() * sizeof(T),
                  cudaMemcpyDeviceToDevice);
+#endif
     }
   } else if (compute_type == "MUL") {
     const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims);
@@ -231,10 +238,17 @@ void CalculateXGrad(const Context& ctx,
           reduce_idx,
           paddle::experimental::CppTypeToDataType<T>::Type(),
           true);
+#ifdef PADDLE_WITH_HIP
+      hipMemcpy(x_grad,
+                x_grad_out.data<T>(),
+                x_grad_out.numel() * sizeof(T),
+                hipMemcpyDeviceToDevice);
+#else
       cudaMemcpy(x_grad,
                  x_grad_out.data<T>(),
                  x_grad_out.numel() * sizeof(T),
                  cudaMemcpyDeviceToDevice);
+#endif
     }
   }
 } else if (pool_type == "MEAN") {
@@ -270,10 +284,17 @@ void CalculateXGrad(const Context& ctx,
           reduce_idx,
           paddle::experimental::CppTypeToDataType<T>::Type(),
           true);
+#ifdef PADDLE_WITH_HIP
+      hipMemcpy(x_grad,
+                x_grad_out.data<T>(),
+                x_grad_out.numel() * sizeof(T),
+                hipMemcpyDeviceToDevice);
+#else
       cudaMemcpy(x_grad,
                  x_grad_out.data<T>(),
                  x_grad_out.numel() * sizeof(T),
                  cudaMemcpyDeviceToDevice);
+#endif
     }
   } else if (compute_type == "MUL") {
     const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims);
@@ -332,10 +353,17 @@ void CalculateXGrad(const Context& ctx,
           paddle::experimental::CppTypeToDataType<T>::Type(),
           true);
       // TODO(daisiming): Whether use x_grad instead.
+#ifdef PADDLE_WITH_HIP
+      hipMemcpy(x_grad,
+                x_grad_out.data<T>(),
+                x_grad_out.numel() * sizeof(T),
+                hipMemcpyDeviceToDevice);
+#else
       cudaMemcpy(x_grad,
                  x_grad_out.data<T>(),
                  x_grad_out.numel() * sizeof(T),
                  cudaMemcpyDeviceToDevice);
+#endif
     }
   }
 }

From b6e2c27aed851191a76c3377f4d47f0290d999d9 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Mon, 25 Jul 2022 08:02:13 +0000
Subject: [PATCH 35/51] mv graph_send_e_recv to graph_send_ue_recv

---
 ..._e_recv_op.cc => graph_send_ue_recv_op.cc} |  32 +-
 paddle/phi/infermeta/multiary.cc              |  18 +-
 paddle/phi/infermeta/multiary.h               |  18 +-
 .../kernels/cpu/graph_send_e_recv_kernel.cc   | 282 ------------------
 ...ecv_funcs.h => graph_send_ue_recv_funcs.h} |   0
 ...l.cc => graph_send_ue_recv_grad_kernel.cc} |  40 +--
 .../kernels/cpu/graph_send_ue_recv_kernel.cc  | 282 ++++++++++++++++++
 ...ecv_funcs.h => graph_send_ue_recv_funcs.h} |  36 +--
 ...l.cu => graph_send_ue_recv_grad_kernel.cu} |  58 ++--
 ...kernel.cu => graph_send_ue_recv_kernel.cu} | 144 ++++-----
 ...nel.h => graph_send_ue_recv_grad_kernel.h} |  24 +-
 ...v_kernel.h => graph_send_ue_recv_kernel.h} |  20 +-
 ...mpl.h => graph_send_ue_recv_kernel_impl.h} |   0
 ..._recv_sig.cc => graph_send_ue_recv_sig.cc} |  16 +-
 .../fluid/tests/unittests/CMakeLists.txt      |   2 +-
 ...cv_op.py => test_graph_send_ue_recv_op.py} |  86 +++---
 python/setup.py.in                            |   2 +
 17 files changed, 531 insertions(+), 529 deletions(-)
 rename paddle/fluid/operators/{graph_send_e_recv_op.cc => graph_send_ue_recv_op.cc} (82%)
 delete mode 100644 paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc
 rename paddle/phi/kernels/cpu/{graph_send_e_recv_funcs.h => graph_send_ue_recv_funcs.h} (100%)
 rename paddle/phi/kernels/cpu/{graph_send_e_recv_grad_kernel.cc => graph_send_ue_recv_grad_kernel.cc} (93%)
 create mode 100644
paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc rename paddle/phi/kernels/gpu/{graph_send_e_recv_funcs.h => graph_send_ue_recv_funcs.h} (93%) rename paddle/phi/kernels/gpu/{graph_send_e_recv_grad_kernel.cu => graph_send_ue_recv_grad_kernel.cu} (92%) rename paddle/phi/kernels/gpu/{graph_send_e_recv_kernel.cu => graph_send_ue_recv_kernel.cu} (75%) rename paddle/phi/kernels/{graph_send_e_recv_grad_kernel.h => graph_send_ue_recv_grad_kernel.h} (52%) rename paddle/phi/kernels/{graph_send_e_recv_kernel.h => graph_send_ue_recv_kernel.h} (60%) rename paddle/phi/kernels/impl/{graph_send_e_recv_kernel_impl.h => graph_send_ue_recv_kernel_impl.h} (100%) rename paddle/phi/ops/compat/{graph_send_e_recv_sig.cc => graph_send_ue_recv_sig.cc} (72%) rename python/paddle/fluid/tests/unittests/{test_graph_send_e_recv_op.py => test_graph_send_ue_recv_op.py} (91%) diff --git a/paddle/fluid/operators/graph_send_e_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc similarity index 82% rename from paddle/fluid/operators/graph_send_e_recv_op.cc rename to paddle/fluid/operators/graph_send_ue_recv_op.cc index 6c93279be6e81..398ff80eb5b0b 100644 --- a/paddle/fluid/operators/graph_send_e_recv_op.cc +++ b/paddle/fluid/operators/graph_send_ue_recv_op.cc @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -class GraphSendERecvOP : public framework::OperatorWithKernel { +class GraphSendUERecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -33,7 +33,7 @@ class GraphSendERecvOP : public framework::OperatorWithKernel { } }; -class GraphSendERecvGradOp : public framework::OperatorWithKernel { +class GraphSendUERecvGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -53,7 +53,7 @@ class GraphSendERecvGradOp : public framework::OperatorWithKernel { } }; -class GraphSendERecvOpMaker : public framework::OpProtoAndCheckerMaker { +class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", @@ -62,7 +62,7 @@ class GraphSendERecvOpMaker : public framework::OpProtoAndCheckerMaker { "The input edge weight tensor, data type should be same with X"); AddInput("Src_index", "The source index tensor."); AddInput("Dst_index", "The destination index tensor."); - AddOutput("Out", "Output tensor of graph_send_e_recv op."); + AddOutput("Out", "Output tensor of graph_send_ue_recv op."); AddOutput("Dst_count", "Count tensor of Dst_index, mainly for MEAN pool_type.") .AsIntermediate(); @@ -101,13 +101,13 @@ tensor in different pooling types, like sum, mean, max, or min. 
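[Editor's note] The DOC formula above is the whole contract of the operator, so a plain reference implementation is useful next to it. The following is a minimal NumPy sketch of the documented semantics, not the kernel itself; the helper name send_ue_recv_ref, 2-D x/e with matching trailing dimensions, and integer index arrays are assumptions of this example:

    import numpy as np

    def send_ue_recv_ref(x, e, src, dst, compute_type="ADD", pool_type="SUM"):
        # Send: gather source-node features and combine them with edge weights.
        h = x[src] + e if compute_type == "ADD" else x[src] * e
        # Recv: reduce every message into its destination row.
        if pool_type in ("SUM", "MEAN"):
            out = np.zeros_like(x)
            np.add.at(out, dst, h)  # scatter-add; repeated dst rows accumulate
            if pool_type == "MEAN":
                cnt = np.bincount(dst, minlength=x.shape[0])  # plays Dst_count
                out /= np.maximum(cnt, 1)[:, None]
            return out
        # MIN / MAX: start from the reduction identity, then zero untouched rows.
        red = np.minimum if pool_type == "MIN" else np.maximum
        out = np.full_like(x, np.inf if pool_type == "MIN" else -np.inf)
        red.at(out, dst, h)
        out[np.isinf(out)] = 0  # rows that received no message become 0
        return out

For instance, with x of shape (10, 20) and 15 edges (e of shape (15, 20)), out keeps shape (10, 20), which matches the shapes used by the unit tests further down in this series.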
}; template -class GraphSendERecvGradOpMaker : public framework::SingleGradOpMaker { +class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; protected: void Apply(GradOpPtr op) const override { - op->SetType("graph_send_e_recv_grad"); + op->SetType("graph_send_ue_recv_grad"); op->SetInput("X", this->Input("X")); op->SetInput("E", this->Input("E")); op->SetInput("Src_index", this->Input("Src_index")); @@ -134,13 +134,13 @@ class GraphSendERecvGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(graph_send_e_recv, - GraphSendERecvInferShapeFunctor, - PD_INFER_META(phi::GraphSendERecvInferMeta)); -REGISTER_OPERATOR(graph_send_e_recv, - ops::GraphSendERecvOP, - ops::GraphSendERecvOpMaker, - ops::GraphSendERecvGradOpMaker, - ops::GraphSendERecvGradOpMaker, - GraphSendERecvInferShapeFunctor); -REGISTER_OPERATOR(graph_send_e_recv_grad, ops::GraphSendERecvGradOp); +DECLARE_INFER_SHAPE_FUNCTOR(graph_send_ue_recv, + GraphSendUERecvInferShapeFunctor, + PD_INFER_META(phi::GraphSendUERecvInferMeta)); +REGISTER_OPERATOR(graph_send_ue_recv, + ops::GraphSendUERecvOP, + ops::GraphSendUERecvOpMaker, + ops::GraphSendUERecvGradOpMaker, + ops::GraphSendUERecvGradOpMaker, + GraphSendUERecvInferShapeFunctor); +REGISTER_OPERATOR(graph_send_ue_recv_grad, ops::GraphSendUERecvGradOp); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index f58f26bf2101a..821fc7e0a60ea 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2349,15 +2349,15 @@ void Yolov3LossInferMeta(const MetaTensor& x, gt_match_mask->set_dtype(x.dtype()); } -void GraphSendERecvInferMeta(const MetaTensor& x, - const MetaTensor& e, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, - int64_t out_size, - MetaTensor* out, - MetaTensor* dst_count) { +void GraphSendUERecvInferMeta(const MetaTensor& x, + const MetaTensor& e, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t out_size, + MetaTensor* out, + MetaTensor* dst_count) { auto src_index_dims = src_index.dims(); if (src_index_dims.size() == 2) { PADDLE_ENFORCE_EQ(src_index_dims[1], diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 12cc32780bfed..724b28230ee40 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -403,14 +403,14 @@ void Yolov3LossInferMeta(const MetaTensor& x, MetaTensor* objectness_mask, MetaTensor* gt_match_mask); -void GraphSendERecvInferMeta(const MetaTensor& x, - const MetaTensor& e, - const MetaTensor& src_index, - const MetaTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, - int64_t out_size, - MetaTensor* out, - MetaTensor* dst_count); +void GraphSendUERecvInferMeta(const MetaTensor& x, + const MetaTensor& e, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t out_size, + MetaTensor* out, + MetaTensor* dst_count); } // namespace phi diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc deleted file mode 100644 index 93f75f456c686..0000000000000 --- a/paddle/phi/kernels/cpu/graph_send_e_recv_kernel.cc +++ /dev/null @@ -1,282 +0,0 @@ -// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/kernels/graph_send_e_recv_kernel.h" - -#include -#include -#include - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" - -namespace phi { - -template -void GraphSendERecvSumCpuKernel(const BroadCastInfo& bcast, - const T* x_data, - const T* e_data, - const IndexT* src_indices, - const IndexT* dst_indices, - T* output, - int64_t index_size, - ComputeFunctor cfunctor) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int64_t i = 0; i < index_size; i++) { - IndexT src = src_indices[i]; - IndexT dst = dst_indices[i]; - T* out_off = output + dst * bcast.out_len; - const T* x_off = x_data + src * bcast.l_len; - const T* e_off = e_data + i * bcast.r_len; - for (int64_t j = 0; j < bcast.out_len; j++) { - int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; - int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; - T val = cfunctor(x_off[x_add], e_off[e_add]); - if (val != 0) { -#ifdef PADDLE_WITH_MKLML -#pragma omp atomic -#endif - out_off[j] += val; - } - } - } -} - -template -void GraphSendERecvMinMaxCpuKernel(const BroadCastInfo& bcast, - const T* x_data, - const T* e_data, - const IndexT* src_indices, - const IndexT* dst_indices, - T* output, - int64_t index_size, - ComputeFunctor cfunctor, - CmpFunctor pfunctor) { - std::set existed_dst; -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int64_t i = 0; i < index_size; i++) { - IndexT src = src_indices[i]; - IndexT dst = dst_indices[i]; - T* out_off = output + dst * bcast.out_len; - const T* x_off = x_data + src * bcast.l_len; - const T* e_off = e_data + i * bcast.r_len; - bool in_set = existed_dst.find(dst) != existed_dst.end(); - for (int64_t j = 0; j < bcast.out_len; j++) { - int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; - int64_t e_add = bcast.use_bcast ? 
bcast.r_offset[j] : j; - T val = cfunctor(x_off[x_add], e_off[e_add]); -#ifdef PADDLE_WITH_MKLML -#pragma omp critical -#endif - if (!in_set) { - out_off[j] += val; - } else { - out_off[j] = pfunctor(out_off[j], val); - } - } -#ifdef PADDLE_WITH_MKLML -#pragma omp critical -#endif - if (!in_set) { - existed_dst.emplace(dst); - } - } -} - -template -void GraphSendERecvOpKernelLaunchHelper(const Context& ctx, - const DenseTensor& x, - const DenseTensor& e, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, - int64_t out_size, - DenseTensor* out, - DenseTensor* dst_count = nullptr) { - const int& index_size = src_index.dims()[0]; - ctx.template Alloc(out); - T* out_data = out->data(); - auto out_dims = out->dims(); - int64_t memset_size = 1; - for (int i = 0; i < out_dims.size(); i++) { - memset_size *= out_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - memset(out_data, 0, memset_bytes); - - if (index_size == 0) return; - const auto& bcast_info = phi::CalcBCastInfo(x.dims(), e.dims()); - const T* x_data = x.data(); - const T* e_data = e.data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - if (pool_type == "SUM" || pool_type == "MEAN") { - if (compute_type == "ADD") { - GraphAddFunctor add_functor; - GraphSendERecvSumCpuKernel>(bcast_info, - x_data, - e_data, - s_index, - d_index, - out_data, - index_size, - add_functor); - } else if (compute_type == "MUL") { - GraphMulFunctor mul_functor; - GraphSendERecvSumCpuKernel>(bcast_info, - x_data, - e_data, - s_index, - d_index, - out_data, - index_size, - mul_functor); - } - if (pool_type == "MEAN") { - int* dst_count_data = ctx.template Alloc(dst_count); - memset(dst_count_data, 0, dst_count->dims()[0] * sizeof(int)); - for (int i = 0; i < index_size; i++) { - IndexT dst_idx = d_index[i]; - dst_count_data[dst_idx] += 1; - } - for (int i = 0; i < out_dims[0]; i++) { - if (dst_count_data[i] == 0) continue; - auto out_slice = out->Slice(i, i + 1); - auto eigen_out = phi::EigenVector::Flatten(out_slice); - eigen_out = eigen_out / static_cast(dst_count_data[i]); - } - } - } else if (pool_type == "MIN") { - GraphMinFunctor min_functor; - if (compute_type == "ADD") { - GraphAddFunctor add_functor; - GraphSendERecvMinMaxCpuKernel, - GraphMinFunctor>(bcast_info, - x_data, - e_data, - s_index, - d_index, - out_data, - index_size, - add_functor, - min_functor); - } else if (compute_type == "MUL") { - GraphMulFunctor mul_functor; - GraphSendERecvMinMaxCpuKernel, - GraphMinFunctor>(bcast_info, - x_data, - e_data, - s_index, - d_index, - out_data, - index_size, - mul_functor, - min_functor); - } - } else if (pool_type == "MAX") { - GraphMaxFunctor max_functor; - if (compute_type == "ADD") { - GraphAddFunctor add_functor; - GraphSendERecvMinMaxCpuKernel, - GraphMaxFunctor>(bcast_info, - x_data, - e_data, - s_index, - d_index, - out_data, - index_size, - add_functor, - max_functor); - } else if (compute_type == "MUL") { - GraphMulFunctor mul_functor; - GraphSendERecvMinMaxCpuKernel, - GraphMaxFunctor>(bcast_info, - x_data, - e_data, - s_index, - d_index, - out_data, - index_size, - mul_functor, - max_functor); - } - } -} - -template -void GraphSendERecvKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& e, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, - int64_t out_size, - DenseTensor* out, - DenseTensor* 
dst_count) { - auto index_type = src_index.dtype(); - if (index_type == phi::DataType::INT32) { - GraphSendERecvOpKernelLaunchHelper(ctx, - x, - e, - src_index, - dst_index, - compute_type, - pool_type, - out_size, - out, - dst_count); - } else if (index_type == phi::DataType::INT64) { - GraphSendERecvOpKernelLaunchHelper(ctx, - x, - e, - src_index, - dst_index, - compute_type, - pool_type, - out_size, - out, - dst_count); - } -} - -} // namespace phi - -PD_REGISTER_KERNEL(graph_send_e_recv, - CPU, - ALL_LAYOUT, - phi::GraphSendERecvKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h similarity index 100% rename from paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h rename to paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h diff --git a/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc similarity index 93% rename from paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc rename to paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc index 124c4beb0bde0..65bcee4d53dc0 100644 --- a/paddle/phi/kernels/cpu/graph_send_e_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h" +#include "paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h" #include #include @@ -20,11 +20,11 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/graph_send_e_recv_funcs.h" #include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" +#include "paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -352,7 +352,7 @@ void CalculateXEGradForMinMax(const T* out_grad, } template -void GraphSendERecvGradOpKernelLaunchHelper( +void GraphSendUERecvGradOpKernelLaunchHelper( const Context& ctx, const DenseTensor& out_grad, const DenseTensor& x, @@ -443,21 +443,21 @@ void GraphSendERecvGradOpKernelLaunchHelper( } template -void GraphSendERecvGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& e, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const paddle::optional& out, - const paddle::optional& dst_count, - const DenseTensor& out_grad, - const std::string& compute_type, - const std::string& pool_type, - DenseTensor* x_grad, - DenseTensor* e_grad) { +void GraphSendUERecvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, + const std::string& compute_type, + const std::string& pool_type, + DenseTensor* x_grad, + DenseTensor* e_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { - GraphSendERecvGradOpKernelLaunchHelper( + GraphSendUERecvGradOpKernelLaunchHelper( ctx, out_grad, x, @@ -471,7 +471,7 @@ void GraphSendERecvGradKernel(const Context& ctx, dst_count.get_ptr(), 
out.get_ptr()); } else if (index_type == phi::DataType::INT64) { - GraphSendERecvGradOpKernelLaunchHelper( + GraphSendUERecvGradOpKernelLaunchHelper( ctx, out_grad, x, @@ -489,10 +489,10 @@ void GraphSendERecvGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_e_recv_grad, +PD_REGISTER_KERNEL(graph_send_ue_recv_grad, CPU, ALL_LAYOUT, - phi::GraphSendERecvGradKernel, + phi::GraphSendUERecvGradKernel, float, double, int, diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc new file mode 100644 index 0000000000000..b78ff7fa7763e --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc @@ -0,0 +1,282 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/graph_send_ue_recv_kernel.h" + +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" + +namespace phi { + +template +void GraphSendUERecvSumCpuKernel(const BroadCastInfo& bcast, + const T* x_data, + const T* e_data, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + int64_t index_size, + ComputeFunctor cfunctor) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = src_indices[i]; + IndexT dst = dst_indices[i]; + T* out_off = output + dst * bcast.out_len; + const T* x_off = x_data + src * bcast.l_len; + const T* e_off = e_data + i * bcast.r_len; + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; + T val = cfunctor(x_off[x_add], e_off[e_add]); + if (val != 0) { +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + out_off[j] += val; + } + } + } +} + +template +void GraphSendUERecvMinMaxCpuKernel(const BroadCastInfo& bcast, + const T* x_data, + const T* e_data, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + int64_t index_size, + ComputeFunctor cfunctor, + CmpFunctor pfunctor) { + std::set existed_dst; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = src_indices[i]; + IndexT dst = dst_indices[i]; + T* out_off = output + dst * bcast.out_len; + const T* x_off = x_data + src * bcast.l_len; + const T* e_off = e_data + i * bcast.r_len; + bool in_set = existed_dst.find(dst) != existed_dst.end(); + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t e_add = bcast.use_bcast ? 
bcast.r_offset[j] : j; + T val = cfunctor(x_off[x_add], e_off[e_add]); +#ifdef PADDLE_WITH_MKLML +#pragma omp critical +#endif + if (!in_set) { + out_off[j] += val; + } else { + out_off[j] = pfunctor(out_off[j], val); + } + } +#ifdef PADDLE_WITH_MKLML +#pragma omp critical +#endif + if (!in_set) { + existed_dst.emplace(dst); + } + } +} + +template +void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t out_size, + DenseTensor* out, + DenseTensor* dst_count = nullptr) { + const int& index_size = src_index.dims()[0]; + ctx.template Alloc(out); + T* out_data = out->data(); + auto out_dims = out->dims(); + int64_t memset_size = 1; + for (int i = 0; i < out_dims.size(); i++) { + memset_size *= out_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + memset(out_data, 0, memset_bytes); + + if (index_size == 0) return; + const auto& bcast_info = phi::CalcBCastInfo(x.dims(), e.dims()); + const T* x_data = x.data(); + const T* e_data = e.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + if (pool_type == "SUM" || pool_type == "MEAN") { + if (compute_type == "ADD") { + GraphAddFunctor add_functor; + GraphSendUERecvSumCpuKernel>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + add_functor); + } else if (compute_type == "MUL") { + GraphMulFunctor mul_functor; + GraphSendUERecvSumCpuKernel>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + mul_functor); + } + if (pool_type == "MEAN") { + int* dst_count_data = ctx.template Alloc(dst_count); + memset(dst_count_data, 0, dst_count->dims()[0] * sizeof(int)); + for (int i = 0; i < index_size; i++) { + IndexT dst_idx = d_index[i]; + dst_count_data[dst_idx] += 1; + } + for (int i = 0; i < out_dims[0]; i++) { + if (dst_count_data[i] == 0) continue; + auto out_slice = out->Slice(i, i + 1); + auto eigen_out = phi::EigenVector::Flatten(out_slice); + eigen_out = eigen_out / static_cast(dst_count_data[i]); + } + } + } else if (pool_type == "MIN") { + GraphMinFunctor min_functor; + if (compute_type == "ADD") { + GraphAddFunctor add_functor; + GraphSendUERecvMinMaxCpuKernel, + GraphMinFunctor>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + add_functor, + min_functor); + } else if (compute_type == "MUL") { + GraphMulFunctor mul_functor; + GraphSendUERecvMinMaxCpuKernel, + GraphMinFunctor>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + mul_functor, + min_functor); + } + } else if (pool_type == "MAX") { + GraphMaxFunctor max_functor; + if (compute_type == "ADD") { + GraphAddFunctor add_functor; + GraphSendUERecvMinMaxCpuKernel, + GraphMaxFunctor>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + add_functor, + max_functor); + } else if (compute_type == "MUL") { + GraphMulFunctor mul_functor; + GraphSendUERecvMinMaxCpuKernel, + GraphMaxFunctor>(bcast_info, + x_data, + e_data, + s_index, + d_index, + out_data, + index_size, + mul_functor, + max_functor); + } + } +} + +template +void GraphSendUERecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t out_size, + DenseTensor* out, + DenseTensor* 
dst_count) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendUERecvOpKernelLaunchHelper(ctx, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + out_size, + out, + dst_count); + } else if (index_type == phi::DataType::INT64) { + GraphSendUERecvOpKernelLaunchHelper(ctx, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + out_size, + out, + dst_count); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_ue_recv, + CPU, + ALL_LAYOUT, + phi::GraphSendUERecvKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h similarity index 93% rename from paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h rename to paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index 34d5f22d165b6..b035a52aece41 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" namespace phi { @@ -68,21 +68,21 @@ inline int FindNumThreads(int dim, int max_num_threads = CUDA_MAX_NUM_THREADS) { } template -struct GraphSendERecvSumCUDAFunctor { +struct GraphSendUERecvSumCUDAFunctor { DEVICE inline void operator()(T* output, T val) { paddle::platform::CudaAtomicAdd(output, val); } }; template -struct GraphSendERecvMaxCUDAFunctor { +struct GraphSendUERecvMaxCUDAFunctor { DEVICE inline void operator()(T* output, T val) { paddle::platform::CudaAtomicMax(output, val); } }; template -struct GraphSendERecvMinCUDAFunctor { +struct GraphSendUERecvMinCUDAFunctor { DEVICE inline void operator()(T* output, T val) { paddle::platform::CudaAtomicMin(output, val); } @@ -92,20 +92,20 @@ template -__global__ void GraphSendERecvCUDAKernel(const T* x_data, - const T* e_data, - const IndexT* src_indices, - const IndexT* dst_indices, - const int64_t* xbcast_off, - const int64_t* ebcast_off, - T* output, - int64_t index_size, - int64_t x_len, - int64_t e_len, - int64_t out_len, - bool use_bcast, - ComputeFunctor cfunctor, - ReduceFunctor rfunctor) { +__global__ void GraphSendUERecvCUDAKernel(const T* x_data, + const T* e_data, + const IndexT* src_indices, + const IndexT* dst_indices, + const int64_t* xbcast_off, + const int64_t* ebcast_off, + T* output, + int64_t index_size, + int64_t x_len, + int64_t e_len, + int64_t out_len, + bool use_bcast, + ComputeFunctor cfunctor, + ReduceFunctor rfunctor) { IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; const IndexT stride_y = blockDim.y * gridDim.y; diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu similarity index 92% rename from paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu rename to paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu index 7782061c3f345..994f6b15b9675 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu @@ -12,16 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
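[Editor's note] All of the kernels in this series index x and e through precomputed flat offsets (bcast.l_offset / bcast.r_offset, guarded by use_bcast) so that a broadcasted binary op runs inside one flat loop of length out_len. A rough Python equivalent of that precomputation, assuming NumPy-style trailing-dimension broadcasting (bcast_offsets is an illustrative name, not phi's CalcBCastInfo API):

    import numpy as np

    def bcast_offsets(l_shape, r_shape):
        # Feature shapes per node/edge, excluding the leading index dimension.
        out_shape = np.broadcast_shapes(l_shape, r_shape)
        l_off, r_off = [], []
        for j in range(int(np.prod(out_shape))):
            idx = np.unravel_index(j, out_shape)
            # A size-1 axis always maps back to offset 0 on that axis.
            l_idx = tuple(0 if s == 1 else i
                          for i, s in zip(idx[len(idx) - len(l_shape):], l_shape))
            r_idx = tuple(0 if s == 1 else i
                          for i, s in zip(idx[len(idx) - len(r_shape):], r_shape))
            l_off.append(int(np.ravel_multi_index(l_idx, l_shape)))
            r_off.append(int(np.ravel_multi_index(r_idx, r_shape)))
        return l_off, r_off

    # l_off/r_off play the role of bcast.l_offset/bcast.r_offset: a kernel then
    # reads x_off[l_off[j]] and e_off[r_off[j]] for j in range(out_len), with
    # the identity mapping j -> j used whenever no broadcasting is needed.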
-#include "paddle/phi/kernels/graph_send_e_recv_grad_kernel.h" +#include "paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" +#include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -187,12 +187,12 @@ void CalculateXGrad(const Context& ctx, const dim3 grid_(nbx, nby); const dim3 block_(ntx, nty); funcs::MultiplyFunctor mul_functor; - GraphSendERecvSumCUDAFunctor sum_functor; + GraphSendUERecvSumCUDAFunctor sum_functor; if (!reduce) { - GraphSendERecvCUDAKernel, - funcs::MultiplyFunctor> + GraphSendUERecvCUDAKernel, + funcs::MultiplyFunctor> <<>>( out_grad, e_data, @@ -213,10 +213,10 @@ void CalculateXGrad(const Context& ctx, phi::EmptyLike(ctx, out_grad_tensor); phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); T* x_grad_v2_data = x_grad_v2.data(); - GraphSendERecvCUDAKernel, - funcs::MultiplyFunctor> + GraphSendUERecvCUDAKernel, + funcs::MultiplyFunctor> <<>>( out_grad, e_data, @@ -458,7 +458,7 @@ void CalculateEGrad(const Context& ctx, } template -void GraphSendERecvGradOpCUDAKernelLaunchHelper( +void GraphSendUERecvGradOpCUDAKernelLaunchHelper( const Context& ctx, const DenseTensor& out_grad, const DenseTensor& x, @@ -556,21 +556,21 @@ void GraphSendERecvGradOpCUDAKernelLaunchHelper( } template -void GraphSendERecvGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& e, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const paddle::optional& out, - const paddle::optional& dst_count, - const DenseTensor& out_grad, - const std::string& compute_type, - const std::string& pool_type, - DenseTensor* x_grad, - DenseTensor* e_grad) { +void GraphSendUERecvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, + const std::string& compute_type, + const std::string& pool_type, + DenseTensor* x_grad, + DenseTensor* e_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { - GraphSendERecvGradOpCUDAKernelLaunchHelper( + GraphSendUERecvGradOpCUDAKernelLaunchHelper( ctx, out_grad, x, @@ -584,7 +584,7 @@ void GraphSendERecvGradKernel(const Context& ctx, dst_count.get_ptr(), out.get_ptr()); } else if (index_type == phi::DataType::INT64) { - GraphSendERecvGradOpCUDAKernelLaunchHelper( + GraphSendUERecvGradOpCUDAKernelLaunchHelper( ctx, out_grad, x, @@ -602,10 +602,10 @@ void GraphSendERecvGradKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL(graph_send_e_recv_grad, +PD_REGISTER_KERNEL(graph_send_ue_recv_grad, GPU, ALL_LAYOUT, - phi::GraphSendERecvGradKernel, + phi::GraphSendUERecvGradKernel, float, double, int, diff --git a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu similarity index 75% rename from paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu rename to 
paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu index 7bf6523e60747..0d57097b8196e 100644 --- a/paddle/phi/kernels/gpu/graph_send_e_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/graph_send_e_recv_kernel.h" -#include "paddle/phi/kernels/gpu/graph_send_e_recv_funcs.h" +#include "paddle/phi/kernels/graph_send_ue_recv_kernel.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h" +#include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" #include #include @@ -30,16 +30,16 @@ namespace phi { template -void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, - const DenseTensor& x, - const DenseTensor& e, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, - int64_t out_size, - DenseTensor* out, - DenseTensor* dst_count = nullptr) { +void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t out_size, + DenseTensor* out, + DenseTensor* dst_count = nullptr) { const int& index_size = src_index.dims()[0]; ctx.template Alloc(out); T* out_data = out->data(); @@ -96,13 +96,13 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, int block_ = 1024; #endif if (pool_type == "SUM" || pool_type == "MEAN") { - GraphSendERecvSumCUDAFunctor sum_functor; + GraphSendUERecvSumCUDAFunctor sum_functor; if (compute_type == "ADD") { funcs::AddFunctor add_funtor; - GraphSendERecvCUDAKernel, - funcs::AddFunctor> + GraphSendUERecvCUDAKernel, + funcs::AddFunctor> <<>>( x_data, e_data, @@ -120,10 +120,10 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, sum_functor); } else if (compute_type == "MUL") { funcs::MultiplyFunctor mul_functor; - GraphSendERecvCUDAKernel, - funcs::MultiplyFunctor> + GraphSendUERecvCUDAKernel, + funcs::MultiplyFunctor> <<>>( x_data, e_data, @@ -164,13 +164,13 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, out_data, dst_count_data, input_size, out_len); } } else if (pool_type == "MAX") { - GraphSendERecvMaxCUDAFunctor max_functor; + GraphSendUERecvMaxCUDAFunctor max_functor; if (compute_type == "ADD") { funcs::AddFunctor add_funtor; - GraphSendERecvCUDAKernel, - funcs::AddFunctor> + GraphSendUERecvCUDAKernel, + funcs::AddFunctor> <<>>( x_data, e_data, @@ -188,10 +188,10 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, max_functor); } else if (compute_type == "MUL") { funcs::MultiplyFunctor mul_functor; - GraphSendERecvCUDAKernel, - funcs::MultiplyFunctor> + GraphSendUERecvCUDAKernel, + funcs::MultiplyFunctor> <<>>( x_data, e_data, @@ -217,13 +217,13 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, InputResetMaxCUDAKernel <<>>(out_data, input_size, out_len); } else if (pool_type == "MIN") { - GraphSendERecvMinCUDAFunctor min_functor; + GraphSendUERecvMinCUDAFunctor min_functor; if (compute_type == "ADD") { funcs::AddFunctor add_funtor; - GraphSendERecvCUDAKernel, - funcs::AddFunctor> + GraphSendUERecvCUDAKernel, + funcs::AddFunctor> <<>>( x_data, e_data, @@ -241,10 +241,10 @@ void 
GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, min_functor); } else if (compute_type == "MUL") { funcs::MultiplyFunctor mul_functor; - GraphSendERecvCUDAKernel, - funcs::MultiplyFunctor> + GraphSendUERecvCUDAKernel, + funcs::MultiplyFunctor> <<>>( x_data, e_data, @@ -273,48 +273,48 @@ void GraphSendERecvOpCUDAKernelLaunchHelper(const Context& ctx, } template -void GraphSendERecvKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& e, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, - int64_t out_size, - DenseTensor* out, - DenseTensor* dst_count) { +void GraphSendUERecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t out_size, + DenseTensor* out, + DenseTensor* dst_count) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { - GraphSendERecvOpCUDAKernelLaunchHelper(ctx, - x, - e, - src_index, - dst_index, - compute_type, - pool_type, - out_size, - out, - dst_count); + GraphSendUERecvOpCUDAKernelLaunchHelper(ctx, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + out_size, + out, + dst_count); } else if (index_type == phi::DataType::INT64) { - GraphSendERecvOpCUDAKernelLaunchHelper(ctx, - x, - e, - src_index, - dst_index, - compute_type, - pool_type, - out_size, - out, - dst_count); + GraphSendUERecvOpCUDAKernelLaunchHelper(ctx, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + out_size, + out, + dst_count); } } } // namespace phi -PD_REGISTER_KERNEL(graph_send_e_recv, +PD_REGISTER_KERNEL(graph_send_ue_recv, GPU, ALL_LAYOUT, - phi::GraphSendERecvKernel, + phi::GraphSendUERecvKernel, float, double, int, diff --git a/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h similarity index 52% rename from paddle/phi/kernels/graph_send_e_recv_grad_kernel.h rename to paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h index cd9a9ee98c84b..b3f60944211ef 100644 --- a/paddle/phi/kernels/graph_send_e_recv_grad_kernel.h +++ b/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h @@ -21,16 +21,16 @@ namespace phi { template -void GraphSendERecvGradKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& e, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const paddle::optional& out, - const paddle::optional& dst_count, - const DenseTensor& out_grad, - const std::string& compute_type, - const std::string& pool_type, - DenseTensor* x_grad, - DenseTensor* e_grad); +void GraphSendUERecvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const paddle::optional& out, + const paddle::optional& dst_count, + const DenseTensor& out_grad, + const std::string& compute_type, + const std::string& pool_type, + DenseTensor* x_grad, + DenseTensor* e_grad); } // namespace phi diff --git a/paddle/phi/kernels/graph_send_e_recv_kernel.h b/paddle/phi/kernels/graph_send_ue_recv_kernel.h similarity index 60% rename from paddle/phi/kernels/graph_send_e_recv_kernel.h rename to paddle/phi/kernels/graph_send_ue_recv_kernel.h index f460ab7b3cbe3..c3b8ab42560ff 100644 --- a/paddle/phi/kernels/graph_send_e_recv_kernel.h +++ b/paddle/phi/kernels/graph_send_ue_recv_kernel.h @@ -20,15 +20,15 @@ namespace phi { template -void 
GraphSendERecvKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& e, - const DenseTensor& src_index, - const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, - int64_t out_size, - DenseTensor* out, - DenseTensor* dst_count); +void GraphSendUERecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& e, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + const std::string& pool_type, + int64_t out_size, + DenseTensor* out, + DenseTensor* dst_count); } // namespace phi diff --git a/paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h similarity index 100% rename from paddle/phi/kernels/impl/graph_send_e_recv_kernel_impl.h rename to paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h diff --git a/paddle/phi/ops/compat/graph_send_e_recv_sig.cc b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc similarity index 72% rename from paddle/phi/ops/compat/graph_send_e_recv_sig.cc rename to paddle/phi/ops/compat/graph_send_ue_recv_sig.cc index a89708cf35736..6f2f5823e0f0e 100644 --- a/paddle/phi/ops/compat/graph_send_e_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc @@ -16,18 +16,18 @@ limitations under the License. */ namespace phi { -KernelSignature GraphSendERecvOpArgumentMapping( +KernelSignature GraphSendUERecvOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("graph_send_e_recv", + return KernelSignature("graph_send_ue_recv", {"X", "E", "Src_index", "Dst_index"}, {"compute_type", "pool_type", "out_size"}, {"Out", "Dst_count"}); } -KernelSignature GraphSendERecvGradOpArgumentMapping( +KernelSignature GraphSendUERecvGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( - "graph_send_e_recv_grad", + "graph_send_ue_recv_grad", {"X", "E", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, {"compute_type", "pool_type"}, {"X@GRAD", "E@GRAD"}); @@ -35,8 +35,8 @@ KernelSignature GraphSendERecvGradOpArgumentMapping( } // namespace phi -PD_REGISTER_ARG_MAPPING_FN(graph_send_e_recv, - phi::GraphSendERecvOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv, + phi::GraphSendUERecvOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(graph_send_e_recv_grad, - phi::GraphSendERecvGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(graph_send_ue_recv_grad, + phi::GraphSendUERecvGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 58fbdaf151370..16e4671143ed0 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1534,7 +1534,7 @@ set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120) set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120) set_tests_properties(test_split_program PROPERTIES TIMEOUT 120) -set_tests_properties(test_graph_send_e_recv_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_graph_send_ue_recv_op PROPERTIES TIMEOUT 60) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py similarity index 91% rename from python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py rename to 
python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index c936ca20dbc99..e32bddc66cb11 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_e_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -96,7 +96,7 @@ def calculate_bcastinfo(self): self.rhs_offset = rhs_offset -def compute_graph_send_e_recv_for_sum(inputs, attributes): +def compute_graph_send_ue_recv_for_sum(inputs, attributes): x = inputs['X'] e = inputs['E'] src_index = inputs['Src_index'] @@ -119,7 +119,7 @@ def compute_graph_send_e_recv_for_sum(inputs, attributes): return results -def compute_graph_send_e_recv_for_mean(inputs, attributes): +def compute_graph_send_ue_recv_for_mean(inputs, attributes): x = inputs['X'] e = inputs['E'] src_index = inputs['Src_index'] @@ -148,7 +148,7 @@ def compute_graph_send_e_recv_for_mean(inputs, attributes): return results, count -def compute_graph_send_e_recv_for_max_min(inputs, attributes): +def compute_graph_send_ue_recv_for_max_min(inputs, attributes): x = inputs['X'] e = inputs['E'] src_index = inputs['Src_index'] @@ -248,11 +248,11 @@ def compute_graph_send_e_recv_for_max_min(inputs, attributes): return results, gradients -class TestGraphSendERecvSumOp(OpTest): +class TestGraphSendUERecvSumOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "graph_send_e_recv" + self.op_type = "graph_send_ue_recv" self.set_config() self.inputs = { 'X': self.x, @@ -262,7 +262,7 @@ def setUp(self): } self.attrs = {'compute_type': self.compute_type, 'pool_type': 'SUM'} - out = compute_graph_send_e_recv_for_sum(self.inputs, self.attrs) + out = compute_graph_send_ue_recv_for_sum(self.inputs, self.attrs) self.outputs = {'Out': out} @@ -281,7 +281,7 @@ def test_check_grad(self): self.check_grad(['X', 'E'], 'Out') -class TestSumCase1(TestGraphSendERecvSumOp): +class TestSumCase1(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -292,7 +292,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestSumCase2(TestGraphSendERecvSumOp): +class TestSumCase2(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -303,7 +303,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestSumCase3(TestGraphSendERecvSumOp): +class TestSumCase3(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -314,7 +314,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestSumCase4(TestGraphSendERecvSumOp): +class TestSumCase4(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") @@ -325,7 +325,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestSumCase5(TestGraphSendERecvSumOp): +class TestSumCase5(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") @@ -336,7 +336,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestSumCase6(TestGraphSendERecvSumOp): +class TestSumCase6(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") @@ -347,7 +347,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestSumCase7(TestGraphSendERecvSumOp): +class TestSumCase7(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") @@ -358,11 +358,11 @@ def set_config(self): self.compute_type = 'MUL' -class TestGraphSendERecvMeanOp(OpTest): +class TestGraphSendUERecvMeanOp(OpTest): def 
setUp(self): paddle.enable_static() - self.op_type = "graph_send_e_recv" + self.op_type = "graph_send_ue_recv" self.set_config() self.inputs = { 'X': self.x, @@ -372,7 +372,7 @@ def setUp(self): } self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MEAN'} - out, dst_count = compute_graph_send_e_recv_for_mean( + out, dst_count = compute_graph_send_ue_recv_for_mean( self.inputs, self.attrs) self.outputs = {'Out': out, 'Dst_count': dst_count} @@ -392,7 +392,7 @@ def test_check_grad(self): self.check_grad(['X', 'E'], 'Out') -class TestMeanCase1(TestGraphSendERecvMeanOp): +class TestMeanCase1(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -403,7 +403,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestMeanCase2(TestGraphSendERecvMeanOp): +class TestMeanCase2(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -414,7 +414,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestMeanCase3(TestGraphSendERecvMeanOp): +class TestMeanCase3(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -425,7 +425,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestMeanCase4(TestGraphSendERecvMeanOp): +class TestMeanCase4(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") @@ -436,7 +436,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestMeanCase5(TestGraphSendERecvMeanOp): +class TestMeanCase5(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") @@ -447,7 +447,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestMeanCase6(TestGraphSendERecvMeanOp): +class TestMeanCase6(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") @@ -458,7 +458,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestMeanCase7(TestGraphSendERecvMeanOp): +class TestMeanCase7(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") @@ -469,11 +469,11 @@ def set_config(self): self.compute_type = 'MUL' -class TestGraphSendERecvMaxOp(OpTest): +class TestGraphSendUERecvMaxOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "graph_send_e_recv" + self.op_type = "graph_send_ue_recv" self.set_config() self.inputs = { 'X': self.x, @@ -483,7 +483,7 @@ def setUp(self): } self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MAX'} - out, self.gradients = compute_graph_send_e_recv_for_max_min( + out, self.gradients = compute_graph_send_ue_recv_for_max_min( self.inputs, self.attrs) self.outputs = {'Out': out} @@ -503,7 +503,7 @@ def test_check_grad(self): self.check_grad(['X', 'E'], 'Out', user_defined_grads=self.gradients) -class TestMaxCase1(TestGraphSendERecvMaxOp): +class TestMaxCase1(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -514,7 +514,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestMaxCase2(TestGraphSendERecvMaxOp): +class TestMaxCase2(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -525,7 +525,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestMaxCase3(TestGraphSendERecvMaxOp): +class TestMaxCase3(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -536,7 +536,7 @@ def 
set_config(self): self.compute_type = 'MUL' -class TestMaxCase4(TestGraphSendERecvMaxOp): +class TestMaxCase4(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") @@ -547,7 +547,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestMaxCase5(TestGraphSendERecvMaxOp): +class TestMaxCase5(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") @@ -558,7 +558,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestMaxCase6(TestGraphSendERecvMaxOp): +class TestMaxCase6(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") @@ -569,7 +569,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestMaxCase7(TestGraphSendERecvMaxOp): +class TestMaxCase7(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") @@ -580,11 +580,11 @@ def set_config(self): self.compute_type = 'MUL' -class TestGraphSendERecvMinOp(OpTest): +class TestGraphSendUERecvMinOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "graph_send_e_recv" + self.op_type = "graph_send_ue_recv" self.set_config() self.inputs = { 'X': self.x, @@ -594,7 +594,7 @@ def setUp(self): } self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MIN'} - out, self.gradients = compute_graph_send_e_recv_for_max_min( + out, self.gradients = compute_graph_send_ue_recv_for_max_min( self.inputs, self.attrs) self.outputs = {'Out': out} @@ -614,7 +614,7 @@ def test_check_grad(self): self.check_grad(['X', 'E'], 'Out', user_defined_grads=self.gradients) -class TestMinCase1(TestGraphSendERecvMinOp): +class TestMinCase1(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -625,7 +625,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestMinCase2(TestGraphSendERecvMinOp): +class TestMinCase2(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -636,7 +636,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestMinCase3(TestGraphSendERecvMinOp): +class TestMinCase3(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -647,7 +647,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestMinCase4(TestGraphSendERecvMinOp): +class TestMinCase4(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") @@ -658,7 +658,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestMinCase5(TestGraphSendERecvMinOp): +class TestMinCase5(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") @@ -669,7 +669,7 @@ def set_config(self): self.compute_type = 'MUL' -class TestMinCase6(TestGraphSendERecvMinOp): +class TestMinCase6(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") @@ -680,7 +680,7 @@ def set_config(self): self.compute_type = 'ADD' -class TestMinCase7(TestGraphSendERecvMinOp): +class TestMinCase7(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") diff --git a/python/setup.py.in b/python/setup.py.in index c02ef7f017fca..6c71f07b8f5a7 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -400,6 +400,8 @@ packages=['paddle', 'paddle.device.cuda', 'paddle.version', 'paddle.profiler', + 'paddle.geometric', + 'paddle.geometric.message_passing', ] with 
open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: From 5f4e958384eb225bf1ce33bdacac8a169a003f55 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 25 Jul 2022 09:33:55 +0000 Subject: [PATCH 36/51] move out_size to IntArray --- .../fluid/operators/graph_send_ue_recv_op.cc | 14 ++-- paddle/fluid/pybind/op_function_generator.h | 2 + paddle/phi/api/yaml/legacy_api.yaml | 11 +++ paddle/phi/api/yaml/legacy_backward.yaml | 16 ++++- paddle/phi/infermeta/multiary.cc | 14 +--- paddle/phi/infermeta/multiary.h | 3 +- .../kernels/cpu/graph_send_ue_recv_kernel.cc | 35 ++++++--- .../kernels/gpu/graph_send_ue_recv_kernel.cu | 72 +++++++++++-------- .../phi/kernels/graph_send_ue_recv_kernel.h | 3 +- .../phi/ops/compat/graph_send_ue_recv_sig.cc | 15 ++-- 10 files changed, 123 insertions(+), 62 deletions(-) diff --git a/paddle/fluid/operators/graph_send_ue_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc index 398ff80eb5b0b..f2a3a84dc50e7 100644 --- a/paddle/fluid/operators/graph_send_ue_recv_op.cc +++ b/paddle/fluid/operators/graph_send_ue_recv_op.cc @@ -62,6 +62,10 @@ class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { "The input edge weight tensor, data type should be same with X"); AddInput("Src_index", "The source index tensor."); AddInput("Dst_index", "The destination index tensor."); + AddInput("OutSizeTensor", + "(Tensor, optional). The 0th dimension of the output." + "It has a higher priority than Attr(out_size).") + .AsDispensable(); AddOutput("Out", "Output tensor of graph_send_ue_recv op."); AddOutput("Dst_count", "Count tensor of Dst_index, mainly for MEAN pool_type.") @@ -77,14 +81,14 @@ class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { "tensors of Dst_index.") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); - AddAttr( + AddAttr>( "out_size", - "(int64_t, default 0)" + "(vector, default {0})" "Define the first dimension of Output tensor." - "If set default 0, then the shape of Out is the same with X.") - .SetDefault(0); + "If set default {0}, then the shape of Out is the same with X.") + .SetDefault({0}); AddComment(R"DOC( -Graph Learning Send_E_Recv combine operator. +Graph Learning Send_UE_Recv combine operator. $Out = Recv(Compute(Send(X, Src_index), E, compute_type), Dst_index, pool_type)$ diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 590d9d2f83e8b..e72fdc9c13a06 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -225,6 +225,8 @@ std::map> op_ins_map = { "Bias3", "Mean3", "Var3"}}, + {"graph_send_ue_recv", + {"X", "E", "Src_index", "Dst_index", "OutSizeTensor"}}, }; // NOTE(zhiqiu): Like op_ins_map. 
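[Editor's note] The net effect of PATCH 36 is that the output's leading dimension may now come from a runtime tensor (OutSizeTensor / an IntArray) instead of a compile-time int attribute, so static shape inference keeps x_dims[0] and the kernels resize at launch. A small sketch of the resulting contract (scatter_sum_with_out_size is an illustrative name; 2-D message tensor h assumed):

    import numpy as np

    def scatter_sum_with_out_size(h, dst, num_nodes, out_size=0):
        # out_size <= 0: fall back to the node count, as before this patch;
        # out_size >  0: the caller pins the leading output dimension, and the
        # value only has to be known when the kernel actually runs.
        rows = out_size if out_size > 0 else num_nodes
        out = np.zeros((rows,) + h.shape[1:], dtype=h.dtype)
        np.add.at(out, dst, h)
        return out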
diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index 6408b90813025..d298ec780ece6 100644 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -906,6 +906,17 @@ intermediate : dst_count backward : graph_send_recv_grad +- api : graph_send_ue_recv + args : (Tensor x, Tensor e, Tensor src_index, Tensor dst_index, str compute_type, str pool_type, IntArray out_size) + output : Tensor(out), Tensor(dst_count) + infer_meta : + func : GraphSendUERecvInferMeta + kernel : + func : graph_send_ue_recv + data_type : x + intermediate : dst_count + backward : graph_send_ue_recv_grad + - api : greater_equal args : (Tensor x, Tensor y, int axis = -1) output : Tensor diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index ba8c78edd97ca..023c1b62334fa 100644 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -848,15 +848,27 @@ data_type : out_grad optional: out, dst_count +- backward_api : graph_send_ue_recv_grad + forward : graph_send_ue_recv (Tensor x, Tensor e, Tensor src_index, Tensor dst_index, str compute_type, str pool_type, IntArray out_size) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor e, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str compute_type, str pool_type) + output : Tensor(x_grad), Tensor(e_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, e] + kernel : + func : graph_send_ue_recv_grad + data_type : out_grad + optional: out, dst_count + # grid sample - backward_api : grid_sample_grad forward : grid_sample (Tensor x, Tensor grid, str mode, str padding_mode, bool align_corners) -> Tensor(out) args : (Tensor x, Tensor grid, Tensor out_grad, str mode, str padding_mode, bool align_corners) output : Tensor(x_grad), Tensor(grid_grad) - infer_meta : + infer_meta : func : GeneralBinaryGradInferMeta param : [x, grid] - kernel : + kernel : func : grid_sample_grad data_type : x diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 821fc7e0a60ea..1c2f3e7649211 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2355,7 +2355,7 @@ void GraphSendUERecvInferMeta(const MetaTensor& x, const MetaTensor& dst_index, const std::string& compute_type, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, MetaTensor* out, MetaTensor* dst_count) { auto src_index_dims = src_index.dims(); @@ -2409,11 +2409,7 @@ void GraphSendUERecvInferMeta(const MetaTensor& x, auto x_dims = x.dims(); if (pool_type == "MEAN") { - if (out_size <= 0) { - dst_count->set_dims({x_dims[0]}); - } else { - dst_count->set_dims({out_size}); - } + dst_count->set_dims({x_dims[0]}); dst_count->set_dtype(DataType::INT32); } @@ -2437,11 +2433,7 @@ void GraphSendUERecvInferMeta(const MetaTensor& x, out_dims_array.data(), max_dim, axis); - if (out_size <= 0) { - out_dims_array.insert(out_dims_array.begin(), x_dims[0]); - } else { - out_dims_array.insert(out_dims_array.begin(), out_size); - } + out_dims_array.insert(out_dims_array.begin(), x_dims[0]); out->set_dims(phi::make_ddim(out_dims_array)); } diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 724b28230ee40..d856753be5d19 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/meta_tensor.h" namespace phi { @@ -409,7 +410,7 @@ void GraphSendUERecvInferMeta(const MetaTensor& x, const MetaTensor& dst_index, const std::string& compute_type, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, MetaTensor* out, MetaTensor* dst_count); diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc index b78ff7fa7763e..64fc90a68c7fa 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc @@ -116,13 +116,27 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, DenseTensor* out, DenseTensor* dst_count = nullptr) { const int& index_size = src_index.dims()[0]; - ctx.template Alloc(out); - T* out_data = out->data(); auto out_dims = out->dims(); int64_t memset_size = 1; - for (int i = 0; i < out_dims.size(); i++) { - memset_size *= out_dims[i]; + if (out_size <= 0) { + for (int i = 0; i < out_dims.size(); i++) { + memset_size *= out_dims[i]; + } + } else { + // set out dim following out_size. + std::vector dims_ = phi::vectorize(out_dims); + if (dims_.size() > 0) { + dims_[0] = out_size; + } + out->Resize(phi::make_ddim(dims_)); + memset_size = out_size; + for (int i = 1; i < out_dims.size(); ++i) { + memset_size *= out_dims[i]; + } } + + ctx.template Alloc(out); + T* out_data = out->data(); const size_t& memset_bytes = memset_size * sizeof(T); memset(out_data, 0, memset_bytes); @@ -155,13 +169,15 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, mul_functor); } if (pool_type == "MEAN") { + int64_t input_size = out_size <= 0 ? x.dims()[0] : out_size; + dst_count->Resize({input_size}); int* dst_count_data = ctx.template Alloc(dst_count); - memset(dst_count_data, 0, dst_count->dims()[0] * sizeof(int)); + memset(dst_count_data, 0, input_size * sizeof(int)); for (int i = 0; i < index_size; i++) { IndexT dst_idx = d_index[i]; dst_count_data[dst_idx] += 1; } - for (int i = 0; i < out_dims[0]; i++) { + for (int i = 0; i < input_size; i++) { if (dst_count_data[i] == 0) continue; auto out_slice = out->Slice(i, i + 1); auto eigen_out = phi::EigenVector::Flatten(out_slice); @@ -241,10 +257,11 @@ void GraphSendUERecvKernel(const Context& ctx, const DenseTensor& dst_index, const std::string& compute_type, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count) { auto index_type = src_index.dtype(); + auto& out_size_data = out_size.GetData(); if (index_type == phi::DataType::INT32) { GraphSendUERecvOpKernelLaunchHelper(ctx, x, @@ -253,7 +270,7 @@ void GraphSendUERecvKernel(const Context& ctx, dst_index, compute_type, pool_type, - out_size, + out_size_data[0], out, dst_count); } else if (index_type == phi::DataType::INT64) { @@ -264,7 +281,7 @@ void GraphSendUERecvKernel(const Context& ctx, dst_index, compute_type, pool_type, - out_size, + out_size_data[0], out, dst_count); } diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu index 0d57097b8196e..f657fd7c03c20 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu @@ -41,13 +41,25 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, DenseTensor* out, DenseTensor* dst_count = nullptr) { const int& index_size = src_index.dims()[0]; - 
ctx.template Alloc(out); - T* out_data = out->data(); auto out_dims = out->dims(); int64_t memset_size = 1; - for (int i = 0; i < out_dims.size(); i++) { - memset_size *= out_dims[i]; + if (out_size <= 0) { + for (int i = 0; i < out_dims.size(); i++) { + memset_size *= out_dims[i]; + } + } else { + std::vector dims_ = phi::vectorize(out_dims); + if (dims_.size() > 0) { + dims_[0] = out_size; + } + out->Resize(phi::make_ddim(dims_)); + memset_size = out_size; + for (int i = 1; i < out_dims.size(); ++i) { + memset_size *= out_dims[i]; + } } + ctx.template Alloc(out); + T* out_data = out->data(); const size_t& memset_bytes = memset_size * sizeof(T); if (pool_type == "SUM" || pool_type == "MEAN") { #ifdef PADDLE_WITH_HIP @@ -141,11 +153,10 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, sum_functor); } if (pool_type == "MEAN") { + input_size = out_size <= 0 ? x.dims()[0] : out_size; + dst_count->Resize({input_size}); ctx.template Alloc(dst_count); - int32_t* dst_count_data = dst_count->data(); - if (out_size > 0) { - input_size = out_size; - } + int* dst_count_data = dst_count->data(); #ifdef PADDLE_WITH_HIP hipMemset(dst_count_data, 0, input_size * sizeof(int)); #else @@ -280,32 +291,35 @@ void GraphSendUERecvKernel(const Context& ctx, const DenseTensor& dst_index, const std::string& compute_type, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count) { auto index_type = src_index.dtype(); + auto& out_size_data = out_size.GetData(); if (index_type == phi::DataType::INT32) { - GraphSendUERecvOpCUDAKernelLaunchHelper(ctx, - x, - e, - src_index, - dst_index, - compute_type, - pool_type, - out_size, - out, - dst_count); + GraphSendUERecvOpCUDAKernelLaunchHelper( + ctx, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + out_size_data[0], + out, + dst_count); } else if (index_type == phi::DataType::INT64) { - GraphSendUERecvOpCUDAKernelLaunchHelper(ctx, - x, - e, - src_index, - dst_index, - compute_type, - pool_type, - out_size, - out, - dst_count); + GraphSendUERecvOpCUDAKernelLaunchHelper( + ctx, + x, + e, + src_index, + dst_index, + compute_type, + pool_type, + out_size_data[0], + out, + dst_count); } } diff --git a/paddle/phi/kernels/graph_send_ue_recv_kernel.h b/paddle/phi/kernels/graph_send_ue_recv_kernel.h index c3b8ab42560ff..4cec8c85d2a95 100644 --- a/paddle/phi/kernels/graph_send_ue_recv_kernel.h +++ b/paddle/phi/kernels/graph_send_ue_recv_kernel.h @@ -15,6 +15,7 @@ #pragma once #include +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { @@ -27,7 +28,7 @@ void GraphSendUERecvKernel(const Context& ctx, const DenseTensor& dst_index, const std::string& compute_type, const std::string& pool_type, - int64_t out_size, + const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count); diff --git a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc index 6f2f5823e0f0e..7270473c1fb6b 100644 --- a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc @@ -18,10 +18,17 @@ namespace phi { KernelSignature GraphSendUERecvOpArgumentMapping( const ArgumentMappingContext& ctx) { - return KernelSignature("graph_send_ue_recv", - {"X", "E", "Src_index", "Dst_index"}, - {"compute_type", "pool_type", "out_size"}, - {"Out", "Dst_count"}); + if (ctx.HasInput("OutSizeTensor")) { + return KernelSignature("graph_send_ue_recv", + {"X", "E", "Src_index", "Dst_index"}, + 
{"compute_type", "pool_type", "OutSizeTensor"}, + {"Out", "Dst_count"}); + } else { + return KernelSignature("graph_send_ue_recv", + {"X", "E", "Src_index", "Dst_index"}, + {"compute_type", "pool_type", "out_size"}, + {"Out", "Dst_count"}); + } } KernelSignature GraphSendUERecvGradOpArgumentMapping( From 100f853349409db2d2b54d8e4b0279ef392d8173 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 25 Jul 2022 12:27:28 +0000 Subject: [PATCH 37/51] add eager op test --- python/paddle/__init__.py | 1 + .../unittests/test_graph_send_ue_recv_op.py | 43 +++- python/paddle/geometric/__init__.py | 19 ++ .../geometric/message_passing/__init__.py | 15 ++ .../geometric/message_passing/send_ue_recv.py | 189 ++++++++++++++++++ .../paddle/geometric/message_passing/utils.py | 85 ++++++++ 6 files changed, 344 insertions(+), 8 deletions(-) create mode 100644 python/paddle/geometric/__init__.py create mode 100644 python/paddle/geometric/message_passing/__init__.py create mode 100644 python/paddle/geometric/message_passing/send_ue_recv.py create mode 100644 python/paddle/geometric/message_passing/utils.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 6e47f4f9eab43..a518fe4c84db7 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -78,6 +78,7 @@ import paddle.reader # noqa: F401 import paddle.static # noqa: F401 import paddle.vision # noqa: F401 +import paddle.geometric # noqa: F401 from .tensor.attribute import is_complex # noqa: F401 from .tensor.attribute import is_integer # noqa: F401 diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index e32bddc66cb11..e977514b18bc1 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -248,10 +248,25 @@ def compute_graph_send_ue_recv_for_max_min(inputs, attributes): return results, gradients +def graph_send_ue_recv_wrapper(x, + y, + src_index, + dst_index, + compute_type="add", + pool_type="sum", + out_size=None, + name=None): + return paddle.geometric.send_ue_recv(x, y, src_index, dst_index, + compute_type.lower(), + pool_type.lower(), out_size, name) + + class TestGraphSendUERecvSumOp(OpTest): def setUp(self): paddle.enable_static() + self.python_api = graph_send_ue_recv_wrapper + self.python_out_sig = ["Out"] self.op_type = "graph_send_ue_recv" self.set_config() self.inputs = { @@ -275,10 +290,10 @@ def set_config(self): self.compute_type = 'ADD' def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'E'], 'Out') + self.check_grad(['X', 'E'], 'Out', check_eager=True) class TestSumCase1(TestGraphSendUERecvSumOp): @@ -362,6 +377,8 @@ class TestGraphSendUERecvMeanOp(OpTest): def setUp(self): paddle.enable_static() + self.python_api = graph_send_ue_recv_wrapper + self.python_out_sig = ["Out"] self.op_type = "graph_send_ue_recv" self.set_config() self.inputs = { @@ -386,10 +403,10 @@ def set_config(self): self.compute_type = 'ADD' def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'E'], 'Out') + self.check_grad(['X', 'E'], 'Out', check_eager=True) class TestMeanCase1(TestGraphSendUERecvMeanOp): @@ -473,6 +490,8 @@ class TestGraphSendUERecvMaxOp(OpTest): def setUp(self): paddle.enable_static() + self.python_api = graph_send_ue_recv_wrapper + 
self.python_out_sig = ["Out"] self.op_type = "graph_send_ue_recv" self.set_config() self.inputs = { @@ -497,10 +516,13 @@ def set_config(self): self.compute_type = 'ADD' def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'E'], 'Out', user_defined_grads=self.gradients) + self.check_grad(['X', 'E'], + 'Out', + user_defined_grads=self.gradients, + check_eager=True) class TestMaxCase1(TestGraphSendUERecvMaxOp): @@ -584,6 +606,8 @@ class TestGraphSendUERecvMinOp(OpTest): def setUp(self): paddle.enable_static() + self.python_api = graph_send_ue_recv_wrapper + self.python_out_sig = ["Out"] self.op_type = "graph_send_ue_recv" self.set_config() self.inputs = { @@ -608,10 +632,13 @@ def set_config(self): self.compute_type = 'ADD' def test_check_output(self): - self.check_output() + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'E'], 'Out', user_defined_grads=self.gradients) + self.check_grad(['X', 'E'], + 'Out', + user_defined_grads=self.gradients, + check_eager=True) class TestMinCase1(TestGraphSendUERecvMinOp): diff --git a/python/paddle/geometric/__init__.py b/python/paddle/geometric/__init__.py new file mode 100644 index 0000000000000..85ccadfbaaf60 --- /dev/null +++ b/python/paddle/geometric/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .message_passing import send_ue_recv # noqa: F401 + +__all__ = [ + 'send_ue_recv', +] diff --git a/python/paddle/geometric/message_passing/__init__.py b/python/paddle/geometric/message_passing/__init__.py new file mode 100644 index 0000000000000..b2a021009c2ad --- /dev/null +++ b/python/paddle/geometric/message_passing/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .send_ue_recv import send_ue_recv # noqa: F401 diff --git a/python/paddle/geometric/message_passing/send_ue_recv.py b/python/paddle/geometric/message_passing/send_ue_recv.py new file mode 100644 index 0000000000000..0da380e17dc85 --- /dev/null +++ b/python/paddle/geometric/message_passing/send_ue_recv.py @@ -0,0 +1,189 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode +from paddle.fluid.framework import Variable +from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype +from paddle import _C_ops + +from .utils import convert_out_size_to_list, get_out_size_tensor_inputs, reshape_lhs_rhs + + +def send_ue_recv(x, + y, + src_index, + dst_index, + compute_type="add", + pool_type="sum", + out_size=None, + name=None): + """ + + Graph Learning message passing api. + + This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` + to gather the corresponding data, after computing with `y`, then use `dst_index` to update the corresponding position + of output tensor in different pooling types, like sum, mean, max, or min. Besides, we can use `out_size` to set + necessary output shape. + + .. code-block:: text + + Given: + + X = [[0, 2, 3], + [1, 4, 5], + [2, 6, 7]] + + E = [1, 1, 1] + + src_index = [0, 1, 2, 0] + dst_index = [1, 2, 1, 0] + pool_type = "sum" + out_size = None + + Then: + + Out = [[0, 2, 3], + [2, 8, 10], + [1, 4, 5]] + Args: + x (Tensor): The input node feature tensor, and the available data type is float32, float64, int32, int64. + e (Tensor): The input edge feature tensor, and the available data type is float32, float64, int32, int64. + src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + The available data type is int32, int64. + compute_type (str): Different compute types for x and e, including `add`, `sub`, `mul`, `div`. + Default value is `add`. + pool_type (str): Different pooling types, including `sum`, `mean`, `max`, `min`. + Default value is `sum`. + out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or + out_size is smaller or equal to 0, then this input will not be used. + Otherwise, `out_size` should be equal with or larger than + max(dst_index) + 1. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. + If `out_size` is set correctly, then it should have the same shape as `x` except + the 0th dimension. + + Examples: + + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + e = paddle.to_tensor([1, 1, 1], type="float32") + indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum") + # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]] + + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out_size = paddle.max(dst_index) + 1 + out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size) + # Outputs: [[0., 2., 3.], [[2., 8., 10.]]] + + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum") + # Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]] + + """ + + if compute_type not in ["add", "sub", "mul", "div"]: + raise ValueError( + "compute_type should be `add`, `sub`, `mul`, `div`, but received %s" + % compute_type) + + if pool_type not in ["sum", "mean", "max", "min"]: + raise ValueError( + "pool_type should be `sum`, `mean`, `max` or `min`, but received %s" + % pool_type) + + x, y = reshape_lhs_rhs(x, y) + + if compute_type == 'sub': + compute_type = 'add' + y = -y + if compute_type == "div": + compute_type = 'mul' + y = 1. / y + + # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. + + if _in_legacy_dygraph(): + out_size = convert_out_size_to_list(out_size) + out, tmp = _C_ops.graph_send_ue_recv(x, y, src_index, dst_index, + None, 'compute_type', + compute_type.upper(), 'pool_type', + pool_type.upper(), 'out_size', + out_size) + return out + if in_dygraph_mode(): + out_size = convert_out_size_to_list(out_size) + return _C_ops.final_state_graph_send_ue_recv(x, y, src_index, dst_index, + compute_type.upper(), + pool_type.upper(), + out_size) + + check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), + "send_ue_recv") + check_variable_and_dtype(y, "E", ("float32", "float64", "int32", "int64"), + "send_ue_recv") + check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), + "send_ue_recv") + check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), + "send_ue_recv") + if out_size: + check_type(out_size, 'out_size', (int, np.int32, np.int64, Variable), + 'send_ue_recv') + if isinstance(out_size, Variable): + check_dtype(out_size.dtype, 'out_size', ['int32', 'int64'], + 'send_ue_recv') + + helper = LayerHelper("send_ue_recv", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + dst_count = helper.create_variable_for_type_inference(dtype="int32", + stop_gradient=True) + + inputs = {"X": x, "E": y, "Src_index": src_index, "Dst_index": dst_index} + attrs = { + "compute_type": compute_type.upper(), + "pool_type": pool_type.upper() + } + get_out_size_tensor_inputs(inputs=inputs, + attrs=attrs, + out_size=out_size, + op_type='send_ue_recv') + + helper.append_op(type="graph_send_ue_recv", + inputs=inputs, + outputs={ + "Out": out, + "Dst_count": dst_count + }, + attrs=attrs) + return out diff --git a/python/paddle/geometric/message_passing/utils.py b/python/paddle/geometric/message_passing/utils.py new file mode 
100644 index 0000000000000..27b201b23235b --- /dev/null +++ b/python/paddle/geometric/message_passing/utils.py @@ -0,0 +1,85 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle.fluid.framework import Variable +from paddle.fluid.data_feeder import check_dtype, convert_dtype +from paddle.fluid.layers.tensor import cast + + +def convert_out_size_to_list(out_size): + """ + Convert out_size(int, np.int32, np.int64, Variable) to list + in imperative mode. + """ + if out_size is None: + out_size = [0] + elif isinstance(out_size, (int, np.int32, np.int64)): + out_size = [out_size] + else: + out_size = [out_size.numpy().astype(int)[0]] + return out_size + + +def get_out_size_tensor_inputs(inputs, attrs, out_size, op_type): + """ + Convert out_size(int, np.int32, np.int64, Variable) to inputs + and attrs in static mode. + """ + if out_size is None: + attrs['out_size'] = [0] + elif isinstance(out_size, (int, np.int32, np.int64)): + attrs['out_size'] = [out_size] + elif isinstance(out_size, Variable): + out_size.stop_gradient = True + check_dtype(out_size.dtype, 'out_size', ['int32', 'int64'], op_type, + '(When type of out_size in ' + op_type + ' is Variable.)') + if (convert_dtype(out_size.dtype) == 'int64'): + out_size = cast(out_size, 'int32') + inputs["OutSizeTensor"] = out_size + else: + raise TypeError("Out_size only supports Variable or int.") + + +def reshape_lhs_rhs(x, y): + """ + Expand dims to ensure there will be no broadcasting issues with different + number of dimensions.
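+
+    For example, y of shape [num_edges, 5] is expanded to [num_edges, 1, 5]
+    so that it broadcasts against x of shape [num_nodes, 8, 5] along the
+    trailing dimensions.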
+ """ + if len(x.shape) == 1: + x = paddle.reshape(x, [-1, 1]) + if len(y.shape) == 1: + y = paddle.reshape(y, [-1, 1]) + + x_shape = paddle.shape(x) + y_shape = paddle.shape(y) + if len(x_shape) != len(y_shape): + max_ndims = max(len(x_shape), len(y_shape)) + x_pad_ndims = max_ndims - len(x_shape) + y_pad_ndims = max_ndims - len(y_shape) + new_x_shape = [ + x_shape[0], + ] + [ + 1, + ] * x_pad_ndims + list(x_shape[1:]) + new_y_shape = [ + y_shape[0], + ] + [ + 1, + ] * y_pad_ndims + list(y_shape[1:]) + x = paddle.reshape(x, new_x_shape) + y = paddle.reshape(y, new_y_shape) + + return x, y From 8b5aed923a296266e9f0268d969339f25c9968be Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 26 Jul 2022 06:23:09 +0000 Subject: [PATCH 38/51] fix max pool type bug, add unittest for api --- .../phi/kernels/gpu/graph_send_recv_funcs.h | 2 +- .../phi/kernels/gpu/graph_send_recv_kernel.cu | 2 +- .../kernels/gpu/graph_send_ue_recv_kernel.cu | 3 +- .../unittests/test_graph_send_ue_recv_op.py | 163 ++++++++++++++++++ .../paddle/geometric/message_passing/utils.py | 8 +- 5 files changed, 171 insertions(+), 7 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h index a93603ae18f1c..4be92ae18629c 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -81,7 +81,7 @@ __global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, size_t slice_size) { CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { + if (*(output + i) == std::numeric_limits::lowest()) { *(output + i) = 0; } } diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu index 7ecf352ffe996..ab5a8ffbb48ea 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu @@ -63,7 +63,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); + std::numeric_limits::lowest()); } else if (pool_type == "MIN") { thrust::device_ptr p_output_ptr(p_output); thrust::fill(thrust::device, diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu index f657fd7c03c20..41ec8169e790b 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu @@ -72,7 +72,8 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, thrust::fill(thrust::device, out_data_ptr, out_data_ptr + memset_size, - std::numeric_limits::min()); + std::numeric_limits::lowest()); + } else if (pool_type == "MIN") { thrust::device_ptr out_data_ptr(out_data); thrust::fill(thrust::device, diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index e977514b18bc1..9b779b152bb99 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -716,3 +716,166 @@ def set_config(self): self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'MUL' + + +class API_GeometricSendUERecvTest(unittest.TestCase): + + def test_compute_all_with_sum(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), + 
dtype="float32") + e = paddle.ones(shape=[4, 1], dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + + res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "add", "sum") + res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "sub", "sum") + res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "mul", "sum") + res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "div", "sum") + res = [res_add, res_sub, res_mul, res_div] + + np_add = np.array([[1, 3, 4], [4, 10, 12], [2, 5, 6]], dtype="float32") + np_sub = np.array([[-1, 1, 2], [0, 6, 8], [0, 3, 4]], dtype="float32") + np_mul = np.array([[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") + np_div = np.array([[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") + + for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], res): + self.assertTrue( + np.allclose(np_res, paddle_res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, paddle_res)) + + def test_compute_all_with_mean(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), + dtype="float32") + e = paddle.ones(shape=[4, 1], dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + + res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "add", "mean") + res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "sub", "mean") + res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "mul", "mean") + res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "div", "mean") + res = [res_add, res_sub, res_mul, res_div] + + np_add = np.array([[1, 3, 4], [2, 5, 6], [2, 5, 6]], dtype="float32") + np_sub = np.array([[-1, 1, 2], [0, 3, 4], [0, 3, 4]], dtype="float32") + np_mul = np.array([[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32") + np_div = np.array([[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32") + + for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], res): + self.assertTrue( + np.allclose(np_res, paddle_res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, paddle_res)) + + def test_compute_all_with_max(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), + dtype="float32") + e = paddle.ones(shape=[4, 1], dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + + res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "add", "max") + res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "sub", "max") + res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "mul", "max") + res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "div", "max") + res = [res_add, res_sub, res_mul, res_div] + + np_add = np.array([[1, 3, 4], [3, 7, 8], [2, 5, 6]], dtype="float32") + np_sub = np.array([[-1, 1, 2], [1, 5, 6], [0, 3, 4]], dtype="float32") + np_mul = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32") + np_div = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32") + + self.assertTrue(np.allclose(np_sub, res_sub, atol=1e-6)) + for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], res): + self.assertTrue( + np.allclose(np_res, paddle_res, atol=1e-6), 
"two value is\ + {}\n{}, check diff!".format(np_res, paddle_res)) + + def test_compute_all_with_min(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), + dtype="float32") + e = paddle.ones(shape=[4, 1], dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + + res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "add", "min") + res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "sub", "min") + res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "mul", "min") + res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "div", "min") + res = [res_add, res_sub, res_mul, res_div] + + np_add = np.array([[1, 3, 4], [1, 3, 4], [2, 5, 6]], dtype="float32") + np_sub = np.array([[-1, 1, 2], [-1, 1, 2], [0, 3, 4]], dtype="float32") + np_mul = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") + np_div = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") + + self.assertTrue(np.allclose(np_sub, res_sub, atol=1e-6)) + for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], res): + self.assertTrue( + np.allclose(np_res, paddle_res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, paddle_res)) + + def test_out_size_tensor_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[3, 3], dtype="float32") + e = paddle.static.data(name="e", shape=[3], dtype="float32") + src_index = paddle.static.data(name="src", shape=[3], dtype="int32") + dst_index = paddle.static.data(name="dst", shape=[3], dtype="int32") + out_size = paddle.static.data(name="out_size", + shape=[1], + dtype="int32") + + res_sum = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "mul", "sum", out_size) + + exe = paddle.static.Executor(paddle.CPUPlace()) + data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]], dtype="float32") + data2 = np.array([1, 2, 3], dtype="float32") + data3 = np.array([0, 0, 1], dtype="int32") + data4 = np.array([0, 1, 1], dtype="int32") + data5 = np.array([2], dtype="int32") + + np_sum = np.array([[0, 2, 3], [3, 16, 21]], dtype="float32") + + ret = exe.run(feed={ + 'x': data1, + 'e': data2, + 'src': data3, + 'dst': data4, + 'out_size': data5, + }, + fetch_list=[res_sum]) + self.assertTrue( + np.allclose(np_sum, ret[0], atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_sum, ret[0])) + + def test_api_eager_dygraph(self): + with _test_eager_guard(): + self.test_compute_all_with_sum() + self.test_compute_all_with_mean() + self.test_compute_all_with_max() + self.test_compute_all_with_min() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/geometric/message_passing/utils.py b/python/paddle/geometric/message_passing/utils.py index 27b201b23235b..9ec807dbb6fe5 100644 --- a/python/paddle/geometric/message_passing/utils.py +++ b/python/paddle/geometric/message_passing/utils.py @@ -65,10 +65,10 @@ def reshape_lhs_rhs(x, y): x_shape = paddle.shape(x) y_shape = paddle.shape(y) - if len(x_shape) != len(y_shape): - max_ndims = max(len(x_shape), len(y_shape)) - x_pad_ndims = max_ndims - len(x_shape) - y_pad_ndims = max_ndims - len(y_shape) + if len(x.shape) != len(y.shape): + max_ndims = max(len(x.shape), len(y.shape)) + x_pad_ndims = max_ndims - len(x.shape) + y_pad_ndims = max_ndims - len(y.shape) new_x_shape = [ x_shape[0], ] + [ From 
8a5057d63b09c1fd8c68317ec5cd1bd5ea0ce22b Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 26 Jul 2022 06:46:51 +0000 Subject: [PATCH 39/51] revise api doc --- .../geometric/message_passing/send_ue_recv.py | 33 +++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/python/paddle/geometric/message_passing/send_ue_recv.py b/python/paddle/geometric/message_passing/send_ue_recv.py index 0da380e17dc85..94f450a79d4db 100644 --- a/python/paddle/geometric/message_passing/send_ue_recv.py +++ b/python/paddle/geometric/message_passing/send_ue_recv.py @@ -36,9 +36,9 @@ def send_ue_recv(x, This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding data, after computing with `y`, then use `dst_index` to update the corresponding position - of output tensor in different pooling types, like sum, mean, max, or min. Besides, we can use `out_size` to set - necessary output shape. + to gather the corresponding data, after computing with `y` in different compute types, then use `dst_index` to + update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. + Besides, we can use `out_size` to set necessary output shape. .. code-block:: text @@ -51,15 +51,20 @@ def send_ue_recv(x, E = [1, 1, 1] src_index = [0, 1, 2, 0] + dst_index = [1, 2, 1, 0] + + compute_type = "add" + pool_type = "sum" + out_size = None Then: - Out = [[0, 2, 3], - [2, 8, 10], - [1, 4, 5]] + Out = [[1, 3, 4], + [4, 10, 12], + [2, 5, 6]] Args: x (Tensor): The input node feature tensor, and the available data type is float32, float64, int32, int64. e (Tensor): The input edge feature tensor, and the available data type is float32, float64, int32, int64. 
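The corrected output values in the example above can be sanity-checked with a short NumPy sketch. This is a hedged illustration covering only the "add" compute with "sum" pooling; the edge weights are written with four entries, one per edge:

    import numpy as np

    x = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype=np.float32)
    e = np.ones((4, 1), dtype=np.float32)
    src_index = np.array([0, 1, 2, 0])
    dst_index = np.array([1, 2, 1, 0])

    out = np.zeros_like(x)
    np.add.at(out, dst_index, x[src_index] + e)  # unbuffered scatter-add
    print(out)  # [[1. 3. 4.] [4. 10. 12.] [2. 5. 6.]]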
@@ -89,27 +94,29 @@ def send_ue_recv(x, import paddle x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") - e = paddle.to_tensor([1, 1, 1], type="float32") + e = paddle.to_tensor([1, 1, 1, 1], dtype="float32") indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum") - # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]] + out = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, compute_type="add", pool_type="sum") + # Outputs: [[1., 3., 4.], [4., 10., 12.], [2., 5., 6.]] x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + e = paddle.to_tensor([1, 1, 1], dtype="float32") indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] out_size = paddle.max(dst_index) + 1 - out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size) - # Outputs: [[0., 2., 3.], [[2., 8., 10.]]] + out = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, compute_type="add", pool_type="sum", out_size=out_size) + # Outputs: [[1., 3., 4.], [[4., 10., 12.]]] x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + e = paddle.to_tensor([1, 1, 1], dtype="float32") indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum") - # Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]] + out = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, compute_type="add", pool_type="sum") + # Outputs: [[1., 3., 4.], [4., 10., 12.], [0., 0., 0.]] """ From dab0ccc4d6114f244854a02737465ebd2968ceb9 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 29 Jul 2022 08:02:10 +0000 Subject: [PATCH 40/51] add fp16 for atomic min and max, add unittest --- .../platform/device/gpu/gpu_primitives.h | 155 ++++++++++++++++++ .../phi/kernels/gpu/graph_send_recv_funcs.h | 6 +- .../kernels/gpu/graph_send_ue_recv_funcs.h | 25 +-- .../gpu/graph_send_ue_recv_grad_kernel.cu | 3 +- .../kernels/gpu/graph_send_ue_recv_kernel.cu | 3 +- .../unittests/test_graph_send_ue_recv_op.py | 73 ++++++++- 6 files changed, 249 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 2ebaee610f3bd..6edec300b44a7 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -419,6 +419,112 @@ CUDA_ATOMIC_WRAPPER(Max, double) { return __longlong_as_double(old); } +#ifdef PADDLE_CUDA_FP16 +template +struct Cast { + // unused. 
+ static __device__ __forceinline__ unsigned int Encode(T val) { + return static_cast(val); + } + static __device__ __forceinline__ T Decode(unsigned int code) { + return static_cast(code); + } +}; + +template <> +struct Cast { + static __device__ __forceinline__ unsigned short int Encode( // NOLINT + half val) { + return __half_as_ushort(val); + } + static __device__ __forceinline__ half + Decode(unsigned short int code) { // NOLINT + return __ushort_as_half(code); + } +}; + +static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT + unsigned short int *address, // NOLINT + unsigned short int compare, // NOLINT + unsigned short int val) { // NOLINT + static_assert(CUDA_VERSION >= 10000, "Requires at least CUDA 10"); +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) + return atomicCAS(address, compare, val); +#else + (void)address; + (void)compare; + (void)val; + printf( + "Atomic operations are not supported for half precision (FP16) " + "on this GPU.\n"); + __trap(); + return val; +#endif +} + +inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) { + float16 low_half; + // the float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(max(static_cast(low_half), x)); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) { + float16 high_half; + // the float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(max(static_cast(high_half), x)); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Max, float16) { + if (*address >= val) { + return *address; + } + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the float16 value stay at higher 16 bits of the address. 
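+      // atomicCAS has no 16-bit overload, so retry compare-and-swap on the
+      // aligned 32-bit word containing this half, splicing the new maximum
+      // into its upper 16 bits via max_to_high_half.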
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} + +/* + CUDA_ATOMIC_WRAPPER(Max, float16) { + typedef unsigned short int FT; + half* address_h = &(address->to_half()); + half val_h = val.to_half(); + FT* addr_as_ui = reinterpret_cast(address_h); + FT old = *addr_as_ui; + FT assumed = old; + do { + assumed = old; + old = atomicCASshort(addr_as_ui, assumed, ) + } +} +*/ +#endif + // For atomicMin USE_CUDA_ATOMIC(Min, int); USE_CUDA_ATOMIC(Min, unsigned int); @@ -503,5 +609,54 @@ CUDA_ATOMIC_WRAPPER(Min, double) { return __longlong_as_double(old); } +inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) { + float16 low_half; + // the float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(min(static_cast(low_half), x)); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) { + float16 high_half; + // the float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(min(static_cast(high_half), x)); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +#ifdef PADDLE_CUDA_FP16 +CUDA_ATOMIC_WRAPPER(Min, float16) { + if (*address <= val) { + return *address; + } + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the float16 value stay at higher 16 bits of the address. 
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h index 4be92ae18629c..0397d0d207f5b 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -119,7 +119,7 @@ __global__ void ManipulateMeanCUDAKernel(T* output, CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { int64_t c_index = i / slice_size; if (*(count + c_index) > 1) { - *(output + i) = *(output + i) / *(count + c_index); + *(output + i) = *(output + i) / static_cast(*(count + c_index)); } } } @@ -140,8 +140,8 @@ __global__ void ManipulateMeanGradCUDAKernel(const T* params, IndexT dst_i = dst_indices[indices_i]; int64_t in_i = src_i * slice_size + slice_i; int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / dst_count[src_i]); + paddle::platform::CudaAtomicAdd( + output + out_i, *(params + in_i) / static_cast(dst_count[src_i])); } } diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index b035a52aece41..b098783176cc2 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -160,7 +160,8 @@ __global__ void ManipulateMeanGradCUDAKernelForMulX(const T* out_grad_data, int64_t o_add = use_bcast ? l_bcastoff[tx] : tx; int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; T val = out_grad_off[o_add] * e_off[e_add]; - paddle::platform::CudaAtomicAdd(x_grad_off + tx, val / dst_count[src]); + paddle::platform::CudaAtomicAdd(x_grad_off + tx, + val / static_cast(dst_count[src])); tx += stride_x; } ty += stride_y; @@ -256,8 +257,9 @@ __global__ void ManipulateMeanGradCUDAKernelForAddE(const T* out_grad_data, const T* out_grad_off = out_grad_data + dst * out_len; while (tx < out_len) { int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; - paddle::platform::CudaAtomicAdd(e_grad_off + e_add, - out_grad_off[tx] / dst_count[dst]); + paddle::platform::CudaAtomicAdd( + e_grad_off + e_add, + out_grad_off[tx] / static_cast(dst_count[dst])); tx += stride_x; } ty += stride_y; @@ -295,7 +297,8 @@ __global__ void ManipulateMeanGradCUDAKernelForMulE(const T* x_data, int64_t x_add = use_bcast ? l_bcastoff[tx] : tx; int64_t e_add = use_bcast ? r_bcastoff[tx] : tx; paddle::platform::CudaAtomicAdd( - e_grad_off + e_add, out_grad_off[tx] * x_off[x_add] / dst_count[dst]); + e_grad_off + e_add, + out_grad_off[tx] * x_off[x_add] / static_cast(dst_count[dst])); tx += stride_x; } ty += stride_y; @@ -338,10 +341,12 @@ __global__ void ManipulateMinMaxGradCUDAKernelForAdd(const T* x_data, int64_t x_add = use_bcast ? xbcast_off[tx] : tx; int64_t e_add = use_bcast ? 
ebcast_off[tx] : tx; T val = x_off[x_add] + e_off[e_add]; - paddle::platform::CudaAtomicAdd(x_grad_off + x_add, - out_grad_off[tx] * (val == out_off[tx])); - paddle::platform::CudaAtomicAdd(e_grad_off + e_add, - out_grad_off[tx] * (val == out_off[tx])); + paddle::platform::CudaAtomicAdd( + x_grad_off + x_add, + out_grad_off[tx] * static_cast(val == out_off[tx])); + paddle::platform::CudaAtomicAdd( + e_grad_off + e_add, + out_grad_off[tx] * static_cast(val == out_off[tx])); tx += stride_x; } ty += stride_y; @@ -386,10 +391,10 @@ __global__ void ManipulateMinMaxGradCUDAKernelForMul(const T* x_data, T val = x_off[x_add] * e_off[e_add]; paddle::platform::CudaAtomicAdd( x_grad_off + x_add, - out_grad_off[tx] * (val == out_off[tx]) * e_off[e_add]); + out_grad_off[tx] * static_cast(val == out_off[tx]) * e_off[e_add]); paddle::platform::CudaAtomicAdd( e_grad_off + e_add, - out_grad_off[tx] * (val == out_off[tx]) * x_off[x_add]); + out_grad_off[tx] * static_cast(val == out_off[tx]) * x_off[x_add]); tx += stride_x; } ty += stride_y; diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu index 994f6b15b9675..c4f818b8601be 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu @@ -609,4 +609,5 @@ PD_REGISTER_KERNEL(graph_send_ue_recv_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu index 41ec8169e790b..e57abc5e758ee 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu @@ -333,4 +333,5 @@ PD_REGISTER_KERNEL(graph_send_ue_recv, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16) {} diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index 9b779b152bb99..7af4a592829db 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -16,6 +16,7 @@ import numpy as np import paddle import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard from op_test import OpTest @@ -805,6 +806,35 @@ def test_compute_all_with_max(self): np.allclose(np_res, paddle_res, atol=1e-6), "two value is\ {}\n{}, check diff!".format(np_res, paddle_res)) + def test_compute_all_with_max_fp16(self): + paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), + dtype="float16") + e = paddle.ones(shape=[4, 1], dtype="float16") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + + res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "add", "max") + res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "sub", "max") + res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "mul", "max") + res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "div", "max") + res = [res_add, res_sub, res_mul, res_div] + + np_add = np.array([[1, 3, 4], [3, 7, 8], [2, 5, 6]], dtype="float16") + np_sub = np.array([[-1, 1, 2], [1, 5, 6], [0, 3, 4]], dtype="float16") + np_mul = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float16") + np_div = np.array([[0, 2, 
3], [2, 6, 7], [1, 4, 5]], dtype="float16") + + self.assertTrue(np.allclose(np_sub, res_sub, atol=1e-6)) + for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], res): + self.assertTrue( + np.allclose(np_res, paddle_res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, paddle_res)) + def test_compute_all_with_min(self): paddle.disable_static() x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), @@ -828,12 +858,53 @@ def test_compute_all_with_min(self): np_mul = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") np_div = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") - self.assertTrue(np.allclose(np_sub, res_sub, atol=1e-6)) for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], res): self.assertTrue( np.allclose(np_res, paddle_res, atol=1e-6), "two value is\ {}\n{}, check diff!".format(np_res, paddle_res)) + def test_compute_all_with_min_fp16(self): + paddle.disable_static() + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, + 7]]), + dtype="float16") + e = paddle.ones(shape=[4, 1], dtype="float16") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), + dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), + dtype="int32") + res_add = paddle.geometric.send_ue_recv(x, e, src_index, + dst_index, "add", "min") + res_sub = paddle.geometric.send_ue_recv(x, e, src_index, + dst_index, "sub", "min") + res_mul = paddle.geometric.send_ue_recv(x, e, src_index, + dst_index, "mul", "min") + res_div = paddle.geometric.send_ue_recv(x, e, src_index, + dst_index, "div", "min") + res = [res_add, res_sub, res_mul, res_div] + + np_add = np.array([[1, 3, 4], [1, 3, 4], [2, 5, 6]], + dtype="float16") + np_sub = np.array([[-1, 1, 2], [-1, 1, 2], [0, 3, 4]], + dtype="float16") + np_mul = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], + dtype="float16") + np_div = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], + dtype="float16") + + for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], + res): + self.assertTrue( + np.allclose(np_res, paddle_res, atol=1e-6), + "two value is\ + {}\n{}, check diff!".format(np_res, paddle_res)) + + def test_compute_all_with_max_fp16(self): + paddle.disable_static() + def test_out_size_tensor_static(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): From 9e21001d272b0c4e92ef6b70d6c6e109658e2a38 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 29 Jul 2022 08:32:10 +0000 Subject: [PATCH 41/51] add unittest --- .../unittests/test_graph_send_ue_recv_op.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index 7af4a592829db..4ec444cdd9676 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -902,8 +902,21 @@ def test_compute_all_with_min_fp16(self): "two value is\ {}\n{}, check diff!".format(np_res, paddle_res)) - def test_compute_all_with_max_fp16(self): + def test_reshape_lhs_rhs(self): paddle.disable_static() + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), + dtype="float32") + x = x.reshape(shape=[3, 3, 1]) + e = paddle.ones([4, 1], dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = 
paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + "add", "min") + np_add = np.array([[1, 3, 4], [1, 3, 4], [2, 5, 6]], + dtype="float16").reshape([3, 3, 1]) + self.assertTrue( + np.allclose(np_add, res_add, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_add, res_add)) def test_out_size_tensor_static(self): paddle.enable_static() @@ -945,7 +958,10 @@ def test_api_eager_dygraph(self): self.test_compute_all_with_sum() self.test_compute_all_with_mean() self.test_compute_all_with_max() + self.test_compute_all_with_max_fp16() self.test_compute_all_with_min() + self.test_compute_all_with_min_fp16() + self.test_reshape_lhs_rhs() if __name__ == "__main__": From 471b0518eeee130a861a0d6a3e4240be4474dd98 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 29 Jul 2022 08:39:28 +0000 Subject: [PATCH 42/51] add fp16 support for graph_send_recv --- .../platform/device/gpu/gpu_primitives.h | 59 +------------------ .../phi/kernels/gpu/graph_send_recv_funcs.h | 3 +- .../gpu/graph_send_recv_grad_kernel.cu | 3 +- .../phi/kernels/gpu/graph_send_recv_kernel.cu | 3 +- 4 files changed, 7 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 6edec300b44a7..bff05a11daeda 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -420,48 +420,6 @@ CUDA_ATOMIC_WRAPPER(Max, double) { } #ifdef PADDLE_CUDA_FP16 -template -struct Cast { - // unused. - static __device__ __forceinline__ unsigned int Encode(T val) { - return static_cast(val); - } - static __device__ __forceinline__ T Decode(unsigned int code) { - return static_cast(code); - } -}; - -template <> -struct Cast { - static __device__ __forceinline__ unsigned short int Encode( // NOLINT - half val) { - return __half_as_ushort(val); - } - static __device__ __forceinline__ half - Decode(unsigned short int code) { // NOLINT - return __ushort_as_half(code); - } -}; - -static __device__ __forceinline__ unsigned short int atomicCASshort( // NOLINT - unsigned short int *address, // NOLINT - unsigned short int compare, // NOLINT - unsigned short int val) { // NOLINT - static_assert(CUDA_VERSION >= 10000, "Requires at least CUDA 10"); -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__) >= 700) - return atomicCAS(address, compare, val); -#else - (void)address; - (void)compare; - (void)val; - printf( - "Atomic operations are not supported for half precision (FP16) " - "on this GPU.\n"); - __trap(); - return val; -#endif -} - inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) { float16 low_half; // the float16 in lower 16bits @@ -508,21 +466,6 @@ CUDA_ATOMIC_WRAPPER(Max, float16) { return ret; } } - -/* - CUDA_ATOMIC_WRAPPER(Max, float16) { - typedef unsigned short int FT; - half* address_h = &(address->to_half()); - half val_h = val.to_half(); - FT* addr_as_ui = reinterpret_cast(address_h); - FT old = *addr_as_ui; - FT assumed = old; - do { - assumed = old; - old = atomicCASshort(addr_as_ui, assumed, ) - } -} -*/ #endif // For atomicMin @@ -609,6 +552,7 @@ CUDA_ATOMIC_WRAPPER(Min, double) { return __longlong_as_double(old); } +#ifdef PADDLE_CUDA_FP16 inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) { float16 low_half; // the float16 in lower 16bits @@ -625,7 +569,6 @@ inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) { return (val & 0xFFFFu) | 
(static_cast(high_half.x) << 16); } -#ifdef PADDLE_CUDA_FP16 CUDA_ATOMIC_WRAPPER(Min, float16) { if (*address <= val) { return *address; diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h index 0397d0d207f5b..e352c50bdc283 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -164,7 +164,8 @@ __global__ void ManipulateMinMaxGradCUDAKernel(const T* params, int64_t out_i = dst_i * slice_size + slice_i; paddle::platform::CudaAtomicAdd( output + out_i, - *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); + *(params + in_i) * + static_cast(*(ptr_input + out_i) == *(ptr_output + in_i))); } } diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu index e78fb7892ed7d..2197af60d6431 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu @@ -142,4 +142,5 @@ PD_REGISTER_KERNEL(graph_send_recv_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu index ab5a8ffbb48ea..e34271b7a70bf 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu @@ -183,4 +183,5 @@ PD_REGISTER_KERNEL(graph_send_recv, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16) {} From 4b6d6ab56abd5a417fd4e849b6492fa062dcd9f4 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 2 Aug 2022 09:58:18 +0000 Subject: [PATCH 43/51] fix unittest fp16 bug --- .../unittests/test_graph_send_ue_recv_op.py | 58 +++++++++++-------- 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index 4ec444cdd9676..ff0f4be685ff0 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -808,32 +808,44 @@ def test_compute_all_with_max(self): def test_compute_all_with_max_fp16(self): paddle.disable_static() - x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), - dtype="float16") - e = paddle.ones(shape=[4, 1], dtype="float16") - src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") - dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, + 7]]), + dtype="float16") + e = paddle.ones(shape=[4, 1], dtype="float16") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), + dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), + dtype="int32") - res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, - "add", "max") - res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, - "sub", "max") - res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, - "mul", "max") - res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, - "div", "max") - res = [res_add, res_sub, res_mul, res_div] + res_add = paddle.geometric.send_ue_recv(x, e, src_index, + dst_index, "add", "max") + res_sub = paddle.geometric.send_ue_recv(x, e, src_index, + dst_index, "sub", "max") + res_mul = 
paddle.geometric.send_ue_recv(x, e, src_index, + dst_index, "mul", "max") + res_div = paddle.geometric.send_ue_recv(x, e, src_index, + dst_index, "div", "max") + res = [res_add, res_sub, res_mul, res_div] - np_add = np.array([[1, 3, 4], [3, 7, 8], [2, 5, 6]], dtype="float16") - np_sub = np.array([[-1, 1, 2], [1, 5, 6], [0, 3, 4]], dtype="float16") - np_mul = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float16") - np_div = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float16") + np_add = np.array([[1, 3, 4], [3, 7, 8], [2, 5, 6]], + dtype="float16") + np_sub = np.array([[-1, 1, 2], [1, 5, 6], [0, 3, 4]], + dtype="float16") + np_mul = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]], + dtype="float16") + np_div = np.array([[0, 2, 3], [2, 6, 7], [1, 4, 5]], + dtype="float16") - self.assertTrue(np.allclose(np_sub, res_sub, atol=1e-6)) - for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], res): - self.assertTrue( - np.allclose(np_res, paddle_res, atol=1e-6), "two value is\ - {}\n{}, check diff!".format(np_res, paddle_res)) + self.assertTrue(np.allclose(np_sub, res_sub, atol=1e-6)) + for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], + res): + self.assertTrue( + np.allclose(np_res, paddle_res, atol=1e-6), + "two values are\ + {}\n{}, check diff!".format(np_res, paddle_res)) def test_compute_all_with_min(self): paddle.disable_static() From 917b029aa72609641c0c0d69afd9e0e5b74960fd Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 9 Aug 2022 11:50:26 +0000 Subject: [PATCH 44/51] change OutSizeTensor to Out_size --- paddle/fluid/operators/graph_send_ue_recv_op.cc | 2 +- paddle/phi/ops/compat/graph_send_ue_recv_sig.cc | 4 ++-- python/paddle/geometric/message_passing/send_recv.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/graph_send_ue_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc index f2a3a84dc50e7..52f015aca1f84 100644 --- a/paddle/fluid/operators/graph_send_ue_recv_op.cc +++ b/paddle/fluid/operators/graph_send_ue_recv_op.cc @@ -62,7 +62,7 @@ class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { "The input edge weight tensor, data type should be same with X"); AddInput("Src_index", "The source index tensor."); AddInput("Dst_index", "The destination index tensor."); - AddInput("OutSizeTensor", + AddInput("Out_size", "(Tensor, optional). The 0th dimension of the output."
"It has a higher priority than Attr(out_size).") .AsDispensable(); diff --git a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc index 7270473c1fb6b..115c738b8577d 100644 --- a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc @@ -18,10 +18,10 @@ namespace phi { KernelSignature GraphSendUERecvOpArgumentMapping( const ArgumentMappingContext& ctx) { - if (ctx.HasInput("OutSizeTensor")) { + if (ctx.HasInput("Out_size")) { return KernelSignature("graph_send_ue_recv", {"X", "E", "Src_index", "Dst_index"}, - {"compute_type", "pool_type", "OutSizeTensor"}, + {"compute_type", "pool_type", "Out_size"}, {"Out", "Dst_count"}); } else { return KernelSignature("graph_send_ue_recv", diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index 1ebfe3e94f40c..dd538d7041337 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -19,7 +19,7 @@ from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from paddle import _C_ops -from .utils import convert_out_size_to_list, get_out_size_tensor_inputs +from .utils import convert_out_size_to_list, get_out_size_tensor_inputs, reshape_lhs_rhs def send_u_recv(x, From bb8517a0c01afcb0c30993a38896367bdbbb5eec Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 9 Aug 2022 14:03:12 +0000 Subject: [PATCH 45/51] move E to Y --- .../fluid/operators/graph_send_ue_recv_op.cc | 14 +- paddle/fluid/pybind/op_function_generator.h | 2 +- paddle/phi/api/yaml/legacy_api.yaml | 2 +- paddle/phi/api/yaml/legacy_backward.yaml | 8 +- paddle/phi/infermeta/multiary.cc | 28 +- paddle/phi/infermeta/multiary.h | 2 +- .../cpu/graph_send_ue_recv_grad_kernel.cc | 2 +- .../kernels/cpu/graph_send_ue_recv_kernel.cc | 22 +- .../kernels/gpu/graph_send_ue_recv_funcs.h | 2 +- .../gpu/graph_send_ue_recv_grad_kernel.cu | 14 +- .../kernels/gpu/graph_send_ue_recv_kernel.cu | 28 +- .../kernels/graph_send_ue_recv_grad_kernel.h | 4 +- .../phi/kernels/graph_send_ue_recv_kernel.h | 2 +- ...l_impl.h => graph_messaage_passing_impl.h} | 0 .../phi/ops/compat/graph_send_ue_recv_sig.cc | 8 +- .../unittests/test_graph_send_ue_recv_op.py | 250 +++++++++--------- .../geometric/message_passing/send_recv.py | 20 +- 17 files changed, 199 insertions(+), 209 deletions(-) rename paddle/phi/kernels/impl/{graph_send_ue_recv_kernel_impl.h => graph_messaage_passing_impl.h} (100%) diff --git a/paddle/fluid/operators/graph_send_ue_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc index 52f015aca1f84..696b2656a7052 100644 --- a/paddle/fluid/operators/graph_send_ue_recv_op.cc +++ b/paddle/fluid/operators/graph_send_ue_recv_op.cc @@ -40,8 +40,8 @@ class GraphSendUERecvGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { auto in_dims = ctx->GetInputDim("X"); ctx->SetOutputDim(framework::GradVarName("X"), in_dims); - auto e_dims = ctx->GetInputDim("E"); - ctx->SetOutputDim(framework::GradVarName("E"), e_dims); + auto y_dims = ctx->GetInputDim("Y"); + ctx->SetOutputDim(framework::GradVarName("Y"), y_dims); } protected: @@ -58,7 +58,7 @@ class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The input tensor with data type float32, float64, int32, int64."); - AddInput("E", + AddInput("Y", "The input edge weight 
tensor, data type should be same with X"); AddInput("Src_index", "The source index tensor."); AddInput("Dst_index", "The destination index tensor."); @@ -90,13 +90,13 @@ class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Graph Learning Send_UE_Recv combine operator. -$Out = Recv(Compute(Send(X, Src_index), E, compute_type), Dst_index, pool_type)$ +$Out = Recv(Compute(Send(X, Src_index), Y, compute_type), Dst_index, pool_type)$ This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `X` as the input tensor, we first use `src_index` to gather corresponding data. -Then the gather data should compute with `E` in different compute_types, like add, sub, mul, and div, +Then we compute the gathered data with `Y` using different compute_types, like add, sub, mul, and div, and get the computation result. Then, use `dst_index` to update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. @@ -113,7 +113,7 @@ class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("graph_send_ue_recv_grad"); op->SetInput("X", this->Input("X")); - op->SetInput("E", this->Input("E")); + op->SetInput("Y", this->Input("Y")); op->SetInput("Src_index", this->Input("Src_index")); op->SetInput("Dst_index", this->Input("Dst_index")); @@ -128,7 +128,7 @@ class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker { op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("E"), this->InputGrad("E")); + op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); op->SetAttrMap(this->Attrs()); } }; diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index f098bdf4a897b..ba0f872cb7449 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -226,7 +226,7 @@ std::map> op_ins_map = { "Mean3", "Var3"}}, {"graph_send_recv", {"X", "Src_index", "Dst_index", "Out_size"}}, - {"graph_send_ue_recv", {"X", "E", "Src_index", "Dst_index", "Out_size"}}, + {"graph_send_ue_recv", {"X", "Y", "Src_index", "Dst_index", "Out_size"}}, }; // NOTE(zhiqiu): Like op_ins_map.
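A reference for the operator semantics documented above, Out = Recv(Compute(Send(X, Src_index), Y, compute_type), Dst_index, pool_type): the following minimal NumPy sketch is not part of the patch; the function name send_ue_recv_ref is illustrative, and it covers only the add/mul compute_types with sum pooling for 2-D inputs.

import numpy as np

def send_ue_recv_ref(x, y, src_index, dst_index, compute_type="add"):
    # Send: gather source-node features along the edges.
    gathered = x[src_index]
    # Compute: combine gathered features with the edge weights y
    # (y broadcasts against the gathered rows, as in the operator).
    msg = gathered + y if compute_type == "add" else gathered * y
    # Recv: sum-pool each message into its destination row.
    out = np.zeros((x.shape[0],) + msg.shape[1:], dtype=x.dtype)
    np.add.at(out, dst_index, msg)
    return out

x = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
y = np.ones((4, 1), dtype="float32")
src_index = np.array([0, 1, 2, 0])
dst_index = np.array([1, 2, 1, 0])
print(send_ue_recv_ref(x, y, src_index, dst_index))
# [[ 1.  3.  4.]
#  [ 4. 10. 12.]
#  [ 2.  5.  6.]]

This matches the first example output in the send_ue_recv docstring updated later in this series.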
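Background for the fp16 atomicMax/atomicMin added to gpu_primitives.h in patch 42 above: since a portable 16-bit atomicCAS is unavailable, the wrappers retarget the update at the aligned 32-bit word containing the half value, rebuild only its low or high 16 bits (max_to_low_half / max_to_high_half), and retry with atomicCAS until the word was not modified concurrently. A rough single-threaded Python model of the low-half path follows; max_to_low_half mirrors the patch's helper, while f16_bits, bits_f16, and atomic_max_f16_low are hypothetical names, and the CAS check is only a stand-in since Python here has no concurrency.

import numpy as np

def f16_bits(v):
    # Bit pattern of a float16 value, as a Python int.
    return int(np.array([v], dtype=np.float16).view(np.uint16)[0])

def bits_f16(b):
    # float16 value stored in a 16-bit pattern.
    return float(np.array([b], dtype=np.uint16).view(np.float16)[0])

def max_to_low_half(word, x):
    # Rebuild the 32-bit word with max(low half, x) in its low 16 bits.
    low = bits_f16(word & 0xFFFF)
    return (word & 0xFFFF0000) | f16_bits(max(low, x))

def atomic_max_f16_low(memory, i, val):
    # CAS retry loop: commit only if the word was unchanged meanwhile.
    while True:
        assumed = memory[i]
        if memory[i] == assumed:  # stands in for atomicCAS
            memory[i] = max_to_low_half(assumed, val)
            return

mem = [(f16_bits(2.0) << 16) | f16_bits(1.0)]  # packs 1.0 (low), 2.0 (high)
atomic_max_f16_low(mem, 0, 3.0)
print(bits_f16(mem[0] & 0xFFFF))  # 3.0; the high half (2.0) is untouched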
diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index 5f827f1aede1c..c7c2f6f0152f9 100755 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -1071,7 +1071,7 @@ backward : graph_send_recv_grad - api : graph_send_ue_recv - args : (Tensor x, Tensor e, Tensor src_index, Tensor dst_index, str compute_type, str pool_type, IntArray out_size) + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str compute_type, str pool_type, IntArray out_size) output : Tensor(out), Tensor(dst_count) infer_meta : func : GraphSendUERecvInferMeta diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 9891b06d00c9c..31ddba838f512 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -953,12 +953,12 @@ optional: out, dst_count - backward_api : graph_send_ue_recv_grad - forward : graph_send_ue_recv (Tensor x, Tensor e, Tensor src_index, Tensor dst_index, str compute_type, str pool_type, IntArray out_size) -> Tensor(out), Tensor(dst_count) - args : (Tensor x, Tensor e, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str compute_type, str pool_type) - output : Tensor(x_grad), Tensor(e_grad) + forward : graph_send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str compute_type, str pool_type, IntArray out_size) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str compute_type, str pool_type) + output : Tensor(x_grad), Tensor(y_grad) infer_meta : func : GeneralBinaryGradInferMeta - param : [x, e] + param : [x, y] kernel : func : graph_send_ue_recv_grad data_type : out_grad diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 292e81d88ec8b..53076d1a5d127 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2600,7 +2600,7 @@ void Yolov3LossInferMeta(const MetaTensor& x, } void GraphSendUERecvInferMeta(const MetaTensor& x, - const MetaTensor& e, + const MetaTensor& y, const MetaTensor& src_index, const MetaTensor& dst_index, const std::string& compute_type, @@ -2647,43 +2647,43 @@ void GraphSendUERecvInferMeta(const MetaTensor& x, phi::errors::InvalidArgument( "Src_index and Dst_index should have the same shape.")); - auto e_dims = e.dims(); + auto y_dims = y.dims(); PADDLE_ENFORCE_EQ( - e_dims[0], + y_dims[0], src_index_dims[0], phi::errors::InvalidArgument( - "Expect Input E to have size %d as Src_index on the first dimension, " + "Expect Input Y to have size %d as Src_index on the first dimension, " "but we get %d", src_index_dims[0], - e_dims[0])); + y_dims[0])); auto x_dims = x.dims(); if (pool_type == "MEAN") { - dst_count->set_dims({x_dims[0]}); + dst_count->set_dims({-1}); dst_count->set_dtype(DataType::INT32); } // Infer out's shape according to x and e(need broadcasting condition) out->set_dtype(x.dtype()); auto x_dims1 = phi::vectorize(x_dims); - auto e_dims1 = phi::vectorize(e_dims); + auto y_dims1 = phi::vectorize(y_dims); std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); - std::vector e_dims2(e_dims1.begin() + 1, e_dims1.end()); + std::vector y_dims2(y_dims1.begin() + 1, y_dims1.end()); - int max_dim = std::max(x_dims2.size(), e_dims2.size()); - int axis = std::abs(static_cast(x_dims2.size() - e_dims2.size())); + int max_dim = std::max(x_dims2.size(), y_dims2.size()); + int axis = 
std::abs(static_cast(x_dims2.size() - y_dims2.size())); std::vector x_dims_array(max_dim); - std::vector e_dims_array(max_dim); + std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); // Only need to broadcast dimensions other than the 0th dimension. phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), - phi::make_ddim(e_dims2), + phi::make_ddim(y_dims2), x_dims_array.data(), - e_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, axis); - out_dims_array.insert(out_dims_array.begin(), x_dims[0]); + out_dims_array.insert(out_dims_array.begin(), -1); out->set_dims(phi::make_ddim(out_dims_array)); } diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index a5d23dc363414..66d8ad84a4378 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -467,7 +467,7 @@ void Yolov3LossInferMeta(const MetaTensor& x, MetaTensor* gt_match_mask); void GraphSendUERecvInferMeta(const MetaTensor& x, - const MetaTensor& e, + const MetaTensor& y, const MetaTensor& src_index, const MetaTensor& dst_index, const std::string& compute_type, diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc index 65bcee4d53dc0..55b778bf8e502 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc @@ -24,7 +24,7 @@ #include "paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_messaage_passing_impl.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc index 64fc90a68c7fa..6f479c7deb3cf 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc @@ -22,7 +22,7 @@ #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_messaage_passing_impl.h" namespace phi { @@ -118,21 +118,15 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, const int& index_size = src_index.dims()[0]; auto out_dims = out->dims(); int64_t memset_size = 1; + std::vector dims_ = phi::vectorize(out_dims); if (out_size <= 0) { - for (int i = 0; i < out_dims.size(); i++) { - memset_size *= out_dims[i]; - } + dims_[0] = x.dims()[0]; } else { - // set out dim following out_size. 
- std::vector dims_ = phi::vectorize(out_dims); - if (dims_.size() > 0) { - dims_[0] = out_size; - } - out->Resize(phi::make_ddim(dims_)); - memset_size = out_size; - for (int i = 1; i < out_dims.size(); ++i) { - memset_size *= out_dims[i]; - } + dims_[0] = out_size; + } + out->Resize(phi::make_ddim(dims_)); + for (size_t i = 0; i < dims_.size(); i++) { + memset_size *= dims_[i]; } ctx.template Alloc(out); diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index b098783176cc2..473fd04494238 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -22,7 +22,7 @@ #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_messaage_passing_impl.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu index c4f818b8601be..bdb4e327e9294 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu @@ -21,7 +21,7 @@ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_messaage_passing_impl.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -558,7 +558,7 @@ void GraphSendUERecvGradOpCUDAKernelLaunchHelper( template void GraphSendUERecvGradKernel(const Context& ctx, const DenseTensor& x, - const DenseTensor& e, + const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, const paddle::optional& out, @@ -567,20 +567,20 @@ void GraphSendUERecvGradKernel(const Context& ctx, const std::string& compute_type, const std::string& pool_type, DenseTensor* x_grad, - DenseTensor* e_grad) { + DenseTensor* y_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUERecvGradOpCUDAKernelLaunchHelper( ctx, out_grad, x, - e, + y, src_index, dst_index, compute_type, pool_type, x_grad, - e_grad, + y_grad, dst_count.get_ptr(), out.get_ptr()); } else if (index_type == phi::DataType::INT64) { @@ -588,13 +588,13 @@ void GraphSendUERecvGradKernel(const Context& ctx, ctx, out_grad, x, - e, + y, src_index, dst_index, compute_type, pool_type, x_grad, - e_grad, + y_grad, dst_count.get_ptr(), out.get_ptr()); } diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu index e57abc5e758ee..4c83bd02473a1 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/graph_send_ue_recv_kernel.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_messaage_passing_impl.h" #include #include @@ -43,21 +43,17 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, const int& index_size = src_index.dims()[0]; auto out_dims = out->dims(); int64_t memset_size = 1; + std::vector dims_ = 
phi::vectorize(out_dims); if (out_size <= 0) { - for (int i = 0; i < out_dims.size(); i++) { - memset_size *= out_dims[i]; - } + dims_[0] = x.dims()[0]; } else { - std::vector dims_ = phi::vectorize(out_dims); - if (dims_.size() > 0) { - dims_[0] = out_size; - } - out->Resize(phi::make_ddim(dims_)); - memset_size = out_size; - for (int i = 1; i < out_dims.size(); ++i) { - memset_size *= out_dims[i]; - } + dims_[0] = out_size; } + out->Resize(phi::make_ddim(dims_)); + for (size_t i = 0; i < dims_.size(); i++) { + memset_size *= dims_[i]; + } + ctx.template Alloc(out); T* out_data = out->data(); const size_t& memset_bytes = memset_size * sizeof(T); @@ -287,7 +283,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, template void GraphSendUERecvKernel(const Context& ctx, const DenseTensor& x, - const DenseTensor& e, + const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& compute_type, @@ -301,7 +297,7 @@ void GraphSendUERecvKernel(const Context& ctx, GraphSendUERecvOpCUDAKernelLaunchHelper( ctx, x, - e, + y, src_index, dst_index, compute_type, @@ -313,7 +309,7 @@ void GraphSendUERecvKernel(const Context& ctx, GraphSendUERecvOpCUDAKernelLaunchHelper( ctx, x, - e, + y, src_index, dst_index, compute_type, diff --git a/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h index b3f60944211ef..f5c7ce9a8937e 100644 --- a/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h +++ b/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h @@ -23,7 +23,7 @@ namespace phi { template void GraphSendUERecvGradKernel(const Context& ctx, const DenseTensor& x, - const DenseTensor& e, + const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, const paddle::optional& out, @@ -32,5 +32,5 @@ void GraphSendUERecvGradKernel(const Context& ctx, const std::string& compute_type, const std::string& pool_type, DenseTensor* x_grad, - DenseTensor* e_grad); + DenseTensor* y_grad); } // namespace phi diff --git a/paddle/phi/kernels/graph_send_ue_recv_kernel.h b/paddle/phi/kernels/graph_send_ue_recv_kernel.h index 4cec8c85d2a95..efb93ab47c93c 100644 --- a/paddle/phi/kernels/graph_send_ue_recv_kernel.h +++ b/paddle/phi/kernels/graph_send_ue_recv_kernel.h @@ -23,7 +23,7 @@ namespace phi { template void GraphSendUERecvKernel(const Context& ctx, const DenseTensor& x, - const DenseTensor& e, + const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& compute_type, diff --git a/paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_messaage_passing_impl.h similarity index 100% rename from paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h rename to paddle/phi/kernels/impl/graph_messaage_passing_impl.h diff --git a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc index 115c738b8577d..a4cd6f4a150b1 100644 --- a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc @@ -20,12 +20,12 @@ KernelSignature GraphSendUERecvOpArgumentMapping( const ArgumentMappingContext& ctx) { if (ctx.HasInput("Out_size")) { return KernelSignature("graph_send_ue_recv", - {"X", "E", "Src_index", "Dst_index"}, + {"X", "Y", "Src_index", "Dst_index"}, {"compute_type", "pool_type", "Out_size"}, {"Out", "Dst_count"}); } else { return KernelSignature("graph_send_ue_recv", - {"X", "E", "Src_index", "Dst_index"}, + {"X", "Y", "Src_index", "Dst_index"}, 
{"compute_type", "pool_type", "out_size"}, {"Out", "Dst_count"}); } @@ -35,9 +35,9 @@ KernelSignature GraphSendUERecvGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "graph_send_ue_recv_grad", - {"X", "E", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, + {"X", "Y", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, {"compute_type", "pool_type"}, - {"X@GRAD", "E@GRAD"}); + {"X@GRAD", "Y@GRAD"}); } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index ff0f4be685ff0..414cc6c639714 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -41,17 +41,17 @@ def get_broadcast_shape(shp1, shp2): class BroadCastInfo(object): - def __init__(self, x_shape, e_shape): + def __init__(self, x_shape, y_shape): self.x_shape = x_shape - self.e_shape = e_shape + self.y_shape = y_shape self.calculate_bcastinfo() def use_bcast(self): - if len(self.x_shape) != len(self.e_shape): + if len(self.x_shape) != len(self.y_shape): return True for i in range(1, len(self.x_shape)): - if self.x_shape[i] != self.e_shape[i]: + if self.x_shape[i] != self.y_shape[i]: return True return False @@ -60,12 +60,12 @@ def calculate_bcastinfo(self): rhs_len = 1 for i in range(1, len(self.x_shape)): lhs_len *= self.x_shape[i] - for i in range(1, len(self.e_shape)): - rhs_len *= self.e_shape[i] + for i in range(1, len(self.y_shape)): + rhs_len *= self.y_shape[i] use_b = self.use_bcast() if use_b: - max_ndim = max(len(self.x_shape), len(self.e_shape)) - 1 + max_ndim = max(len(self.x_shape), len(self.y_shape)) - 1 out_len = 1 stride_l = stride_r = 1 lhs_offset = [0] @@ -73,8 +73,8 @@ def calculate_bcastinfo(self): for j in range(0, max_ndim): dl = 1 if (len(self.x_shape) - 1 - j) < 1 \ else self.x_shape[len(self.x_shape) - 1 - j] - dr = 1 if (len(self.e_shape) - 1 - j) < 1 \ - else self.e_shape[len(self.e_shape) - 1 - j] + dr = 1 if (len(self.y_shape) - 1 - j) < 1 \ + else self.y_shape[len(self.y_shape) - 1 - j] for i in range(1, max(dl, dr)): for k in range(0, out_len): lhs_offset.append(lhs_offset[k] + i * @@ -99,7 +99,7 @@ def calculate_bcastinfo(self): def compute_graph_send_ue_recv_for_sum(inputs, attributes): x = inputs['X'] - e = inputs['E'] + y = inputs['Y'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] compute_type = attributes['compute_type'] @@ -107,22 +107,22 @@ def compute_graph_send_ue_recv_for_sum(inputs, attributes): gather_x = x[src_index] out_shp = [ x.shape[0], - ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) + ] + get_broadcast_shape(x.shape[1:], y.shape[1:]) results = np.zeros(out_shp, dtype=x.dtype) # Calculate forward output. 
if compute_type == 'ADD': - x_compute_e = gather_x + e + x_compute_y = gather_x + y elif compute_type == 'MUL': - x_compute_e = gather_x * e + x_compute_y = gather_x * y for index, s_id in enumerate(dst_index): - results[s_id, :] += x_compute_e[index, :] + results[s_id, :] += x_compute_y[index, :] return results def compute_graph_send_ue_recv_for_mean(inputs, attributes): x = inputs['X'] - e = inputs['E'] + y = inputs['Y'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] compute_type = attributes['compute_type'] @@ -130,17 +130,17 @@ def compute_graph_send_ue_recv_for_mean(inputs, attributes): gather_x = x[src_index] out_shp = [ x.shape[0], - ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) + ] + get_broadcast_shape(x.shape[1:], y.shape[1:]) results = np.zeros(out_shp, dtype=x.dtype) # Calculate forward output. if compute_type == 'ADD': - x_compute_e = gather_x + e + x_compute_y = gather_x + y elif compute_type == 'MUL': - x_compute_e = gather_x * e + x_compute_y = gather_x * y count = np.zeros(out_shp[0], dtype=np.int32) for index, s_id in enumerate(dst_index): - results[s_id, :] += x_compute_e[index, :] + results[s_id, :] += x_compute_y[index, :] count[s_id] += 1 count_shape = [out_shp[0]] count_shape.extend([1] * len(out_shp[1:])) @@ -151,7 +151,7 @@ def compute_graph_send_ue_recv_for_mean(inputs, attributes): def compute_graph_send_ue_recv_for_max_min(inputs, attributes): x = inputs['X'] - e = inputs['E'] + y = inputs['Y'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] compute_type = attributes['compute_type'] @@ -160,91 +160,91 @@ def compute_graph_send_ue_recv_for_max_min(inputs, attributes): gather_x = x[src_index] out_shp = [ x.shape[0], - ] + get_broadcast_shape(x.shape[1:], e.shape[1:]) + ] + get_broadcast_shape(x.shape[1:], y.shape[1:]) results = np.zeros(out_shp, dtype=x.dtype) # Calculate forward output. if compute_type == 'ADD': - x_compute_e = gather_x + e + x_compute_y = gather_x + y elif compute_type == 'MUL': - x_compute_e = gather_x * e + x_compute_y = gather_x * y first_set = set() if pool_type == 'MAX': for index, s_id in enumerate(dst_index): if s_id not in first_set: - results[s_id, :] += x_compute_e[index, :] + results[s_id, :] += x_compute_y[index, :] first_set.add(s_id) else: results[s_id, :] = np.maximum(results[s_id, :], - x_compute_e[index, :]) + x_compute_y[index, :]) elif pool_type == 'MIN': for index, s_id in enumerate(dst_index): if s_id not in first_set: - results[s_id, :] += x_compute_e[index, :] + results[s_id, :] += x_compute_y[index, :] first_set.add(s_id) else: results[s_id, :] = np.minimum(results[s_id, :], - x_compute_e[index, :]) + x_compute_y[index, :]) else: raise ValueError("Invalid pool_type, only MAX, MIN supported!") # Calculate backward gradient. 
x_gradient = np.zeros_like(x) - e_gradient = np.zeros_like(e) - bcast_info = BroadCastInfo(x.shape, e.shape) + y_gradient = np.zeros_like(y) + bcast_info = BroadCastInfo(x.shape, y.shape) use_broadcast = bcast_info.use_broadcast for i in range(len(src_index)): forward_src_idx = src_index[i] forward_dst_idx = dst_index[i] x_off = x[forward_src_idx] - e_off = e[i] + y_off = y[i] out_off = results[forward_dst_idx] x_grad_off = x_gradient[forward_src_idx] - e_grad_off = e_gradient[i] + y_grad_off = y_gradient[i] for j in range(bcast_info.out_len): x_add = bcast_info.lhs_offset[j] if use_broadcast else j - e_add = bcast_info.rhs_offset[j] if use_broadcast else j + y_add = bcast_info.rhs_offset[j] if use_broadcast else j if compute_type == 'ADD': - if len(x_off.shape) == 1 and len(e_off.shape) == 1: - val = x_off[x_add] + e_off[e_add] + if len(x_off.shape) == 1 and len(y_off.shape) == 1: + val = x_off[x_add] + y_off[y_add] x_grad_off[x_add] += 1 * (val == out_off[j]) - e_grad_off[e_add] += 1 * (val == out_off[j]) + y_grad_off[y_add] += 1 * (val == out_off[j]) else: # For simplicity, we only check the situation of x_off.shape=2 x_add_0 = int(x_add / x_off.shape[1]) x_add_1 = int(x_add % x_off.shape[1]) - e_add_0 = int(e_add / e_off.shape[1]) - e_add_1 = int(e_add % e_off.shape[1]) + y_add_0 = int(y_add / y_off.shape[1]) + y_add_1 = int(y_add % y_off.shape[1]) out_add_0 = int(j / out_off.shape[1]) out_add_1 = int(j % out_off.shape[1]) - val = x_off[x_add_0][x_add_1] + e_off[e_add_0][e_add_1] + val = x_off[x_add_0][x_add_1] + y_off[y_add_0][y_add_1] x_grad_off[x_add_0][x_add_1] += 1 * ( val == out_off[out_add_0][out_add_1]) - e_grad_off[e_add_0][e_add_1] += 1 * ( + y_grad_off[y_add_0][y_add_1] += 1 * ( val == out_off[out_add_0][out_add_1]) elif compute_type == 'MUL': - if len(x_off.shape) == 1 and len(e_off.shape) == 1: - val = x_off[x_add] * e_off[e_add] - x_grad_off[x_add] += 1 * (val == out_off[j]) * e_off[e_add] - e_grad_off[e_add] += 1 * (val == out_off[j]) * x_off[x_add] + if len(x_off.shape) == 1 and len(y_off.shape) == 1: + val = x_off[x_add] * y_off[y_add] + x_grad_off[x_add] += 1 * (val == out_off[j]) * y_off[y_add] + y_grad_off[y_add] += 1 * (val == out_off[j]) * x_off[x_add] else: # For simplicity, we only check the situation of x_off.shape=2 x_add_0 = int(x_add / x_off.shape[1]) x_add_1 = int(x_add % x_off.shape[1]) - e_add_0 = int(e_add / e_off.shape[1]) - e_add_1 = int(e_add % e_off.shape[1]) + y_add_0 = int(y_add / y_off.shape[1]) + y_add_1 = int(y_add % y_off.shape[1]) out_add_0 = int(j / out_off.shape[1]) out_add_1 = int(j % out_off.shape[1]) - val = x_off[x_add_0][x_add_1] * e_off[e_add_0][e_add_1] + val = x_off[x_add_0][x_add_1] * y_off[y_add_0][y_add_1] x_grad_off[x_add_0][x_add_1] += 1 * ( val == out_off[out_add_0][out_add_1] - ) * e_off[e_add_0][e_add_1] - e_grad_off[e_add_0][e_add_1] += 1 * ( + ) * y_off[y_add_0][y_add_1] + y_grad_off[y_add_0][y_add_1] += 1 * ( val == out_off[out_add_0][out_add_1] ) * x_off[x_add_0][x_add_1] - gradients = [x_gradient / results.size, e_gradient / results.size] + gradients = [x_gradient / results.size, y_gradient / results.size] return results, gradients @@ -272,7 +272,7 @@ def setUp(self): self.set_config() self.inputs = { 'X': self.x, - 'E': self.e, + 'Y': self.y, 'Src_index': self.src_index, 'Dst_index': self.dst_index } @@ -284,7 +284,7 @@ def setUp(self): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 
20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -294,14 +294,14 @@ def test_check_output(self): self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'E'], 'Out', check_eager=True) + self.check_grad(['X', 'Y'], 'Out', check_eager=True) class TestSumCase1(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -312,7 +312,7 @@ class TestSumCase2(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") + self.y = np.random.random((150, 1)).astype("float64") index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -323,7 +323,7 @@ class TestSumCase3(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") + self.y = np.random.random((150, 1)).astype("float64") index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -334,7 +334,7 @@ class TestSumCase4(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") - self.e = np.random.random((15, 8, 1)).astype("float64") + self.y = np.random.random((15, 8, 1)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -345,7 +345,7 @@ class TestSumCase5(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") - self.e = np.random.random((15, 8, 1)).astype("float64") + self.y = np.random.random((15, 8, 1)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -356,7 +356,7 @@ class TestSumCase6(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -367,7 +367,7 @@ class TestSumCase7(TestGraphSendUERecvSumOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -384,7 +384,7 @@ def setUp(self): self.set_config() self.inputs = { 'X': self.x, - 'E': self.e, + 'Y': self.y, 'Src_index': self.src_index, 'Dst_index': self.dst_index } @@ -397,7 +397,7 @@ def setUp(self): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -407,14 +407,14 @@ def test_check_output(self): 
self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'E'], 'Out', check_eager=True) + self.check_grad(['X', 'Y'], 'Out', check_eager=True) class TestMeanCase1(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -425,7 +425,7 @@ class TestMeanCase2(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") + self.y = np.random.random((150, 1)).astype("float64") index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -436,7 +436,7 @@ class TestMeanCase3(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") + self.y = np.random.random((150, 1)).astype("float64") index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -447,7 +447,7 @@ class TestMeanCase4(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") - self.e = np.random.random((15, 8, 1)).astype("float64") + self.y = np.random.random((15, 8, 1)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -458,7 +458,7 @@ class TestMeanCase5(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") - self.e = np.random.random((15, 8, 1)).astype("float64") + self.y = np.random.random((15, 8, 1)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -469,7 +469,7 @@ class TestMeanCase6(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -480,7 +480,7 @@ class TestMeanCase7(TestGraphSendUERecvMeanOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -497,7 +497,7 @@ def setUp(self): self.set_config() self.inputs = { 'X': self.x, - 'E': self.e, + 'Y': self.y, 'Src_index': self.src_index, 'Dst_index': self.dst_index } @@ -510,7 +510,7 @@ def setUp(self): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -520,7 +520,7 @@ def test_check_output(self): self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'E'], + self.check_grad(['X', 'Y'], 'Out', user_defined_grads=self.gradients, check_eager=True) @@ -530,7 +530,7 @@ class 
TestMaxCase1(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -541,7 +541,7 @@ class TestMaxCase2(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") + self.y = np.random.random((150, 1)).astype("float64") index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -552,7 +552,7 @@ class TestMaxCase3(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") + self.y = np.random.random((150, 1)).astype("float64") index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -563,7 +563,7 @@ class TestMaxCase4(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") - self.e = np.random.random((15, 8, 1)).astype("float64") + self.y = np.random.random((15, 8, 1)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -574,7 +574,7 @@ class TestMaxCase5(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") - self.e = np.random.random((15, 8, 1)).astype("float64") + self.y = np.random.random((15, 8, 1)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -585,7 +585,7 @@ class TestMaxCase6(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -596,7 +596,7 @@ class TestMaxCase7(TestGraphSendUERecvMaxOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -613,7 +613,7 @@ def setUp(self): self.set_config() self.inputs = { 'X': self.x, - 'E': self.e, + 'Y': self.y, 'Src_index': self.src_index, 'Dst_index': self.dst_index } @@ -626,7 +626,7 @@ def setUp(self): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -636,7 +636,7 @@ def test_check_output(self): self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['X', 'E'], + self.check_grad(['X', 'Y'], 'Out', user_defined_grads=self.gradients, check_eager=True) @@ -646,7 +646,7 @@ class TestMinCase1(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = 
np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -657,7 +657,7 @@ class TestMinCase2(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") + self.y = np.random.random((150, 1)).astype("float64") index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -668,7 +668,7 @@ class TestMinCase3(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 20)).astype("float64") - self.e = np.random.random((150, 1)).astype("float64") + self.y = np.random.random((150, 1)).astype("float64") index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -679,7 +679,7 @@ class TestMinCase4(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") - self.e = np.random.random((15, 8, 1)).astype("float64") + self.y = np.random.random((15, 8, 1)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -690,7 +690,7 @@ class TestMinCase5(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((10, 8, 5)).astype("float64") - self.e = np.random.random((15, 8, 1)).astype("float64") + self.y = np.random.random((15, 8, 1)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -701,7 +701,7 @@ class TestMinCase6(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -712,7 +712,7 @@ class TestMinCase7(TestGraphSendUERecvMinOp): def set_config(self): self.x = np.random.random((100, 1)).astype("float64") - self.e = np.random.random((15, 20)).astype("float64") + self.y = np.random.random((15, 20)).astype("float64") index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] @@ -725,17 +725,17 @@ def test_compute_all_with_sum(self): paddle.disable_static() x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") - e = paddle.ones(shape=[4, 1], dtype="float32") + y = paddle.ones(shape=[4, 1], dtype="float32") src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") - res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_add = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "add", "sum") - res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_sub = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "sub", "sum") - res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_mul = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "mul", "sum") - res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_div = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "div", "sum") res = [res_add, res_sub, res_mul, res_div] @@ -753,17 +753,17 @@ def test_compute_all_with_mean(self): paddle.disable_static() x = 
paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") - e = paddle.ones(shape=[4, 1], dtype="float32") + y = paddle.ones(shape=[4, 1], dtype="float32") src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") - res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_add = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "add", "mean") - res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_sub = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "sub", "mean") - res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_mul = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "mul", "mean") - res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_div = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "div", "mean") res = [res_add, res_sub, res_mul, res_div] @@ -781,17 +781,17 @@ def test_compute_all_with_max(self): paddle.disable_static() x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") - e = paddle.ones(shape=[4, 1], dtype="float32") + y = paddle.ones(shape=[4, 1], dtype="float32") src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") - res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_add = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "add", "max") - res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_sub = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "sub", "max") - res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_mul = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "mul", "max") - res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_div = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "div", "max") res = [res_add, res_sub, res_mul, res_div] @@ -814,19 +814,19 @@ def test_compute_all_with_max_fp16(self): x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float16") - e = paddle.ones(shape=[4, 1], dtype="float16") + y = paddle.ones(shape=[4, 1], dtype="float16") src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") - res_add = paddle.geometric.send_ue_recv(x, e, src_index, + res_add = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "add", "max") - res_sub = paddle.geometric.send_ue_recv(x, e, src_index, + res_sub = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "sub", "max") - res_mul = paddle.geometric.send_ue_recv(x, e, src_index, + res_mul = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "mul", "max") - res_div = paddle.geometric.send_ue_recv(x, e, src_index, + res_div = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "div", "max") res = [res_add, res_sub, res_mul, res_div] @@ -851,17 +851,17 @@ def test_compute_all_with_min(self): paddle.disable_static() x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") - e = paddle.ones(shape=[4, 1], dtype="float32") + y = paddle.ones(shape=[4, 1], dtype="float32") src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") - res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_add = paddle.geometric.send_ue_recv(x, y, 
src_index, dst_index, "add", "min") - res_sub = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_sub = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "sub", "min") - res_mul = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_mul = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "mul", "min") - res_div = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_div = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "div", "min") res = [res_add, res_sub, res_mul, res_div] @@ -883,18 +883,18 @@ def test_compute_all_with_min_fp16(self): x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float16") - e = paddle.ones(shape=[4, 1], dtype="float16") + y = paddle.ones(shape=[4, 1], dtype="float16") src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") - res_add = paddle.geometric.send_ue_recv(x, e, src_index, + res_add = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "add", "min") - res_sub = paddle.geometric.send_ue_recv(x, e, src_index, + res_sub = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "sub", "min") - res_mul = paddle.geometric.send_ue_recv(x, e, src_index, + res_mul = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "mul", "min") - res_div = paddle.geometric.send_ue_recv(x, e, src_index, + res_div = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "div", "min") res = [res_add, res_sub, res_mul, res_div] @@ -919,10 +919,10 @@ def test_reshape_lhs_rhs(self): x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") x = x.reshape(shape=[3, 3, 1]) - e = paddle.ones([4, 1], dtype="float32") + y = paddle.ones([4, 1], dtype="float32") src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") - res_add = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_add = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "add", "min") np_add = np.array([[1, 3, 4], [1, 3, 4], [2, 5, 6]], dtype="float16").reshape([3, 3, 1]) @@ -934,14 +934,14 @@ def test_out_size_tensor_static(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): x = paddle.static.data(name="x", shape=[3, 3], dtype="float32") - e = paddle.static.data(name="e", shape=[3], dtype="float32") + y = paddle.static.data(name="y", shape=[3], dtype="float32") src_index = paddle.static.data(name="src", shape=[3], dtype="int32") dst_index = paddle.static.data(name="dst", shape=[3], dtype="int32") out_size = paddle.static.data(name="out_size", shape=[1], dtype="int32") - res_sum = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, + res_sum = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, "mul", "sum", out_size) exe = paddle.static.Executor(paddle.CPUPlace()) @@ -955,7 +955,7 @@ def test_out_size_tensor_static(self): ret = exe.run(feed={ 'x': data1, - 'e': data2, + 'y': data2, 'src': data3, 'dst': data4, 'out_size': data5, diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index dd538d7041337..7279d9d031f90 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -188,7 +188,7 @@ def send_ue_recv(x, [1, 4, 5], [2, 6, 7]] - E = [1, 1, 1] + Y = [1, 1, 1] src_index = [0, 1, 2, 0] @@ -207,7 +207,7 @@ def send_ue_recv(x, [2, 5, 6]] Args: x 
(Tensor): The input node feature tensor, and the available data type is float32, float64, int32, int64. - e (Tensor): The input edge feature tensor, and the available data type is float32, float64, int32, int64. + y (Tensor): The input edge feature tensor, and the available data type is float32, float64, int32, int64. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. @@ -233,28 +233,28 @@ def send_ue_recv(x, import paddle x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") - e = paddle.to_tensor([1, 1, 1, 1], dtype="float32") + y = paddle.to_tensor([1, 1, 1, 1], dtype="float32") indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, compute_type="add", pool_type="sum") + out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, compute_type="add", pool_type="sum") # Outputs: [[1., 3., 4.], [4., 10., 12.], [2., 5., 6.]] x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") - e = paddle.to_tensor([1, 1, 1], dtype="float32") + y = paddle.to_tensor([1, 1, 1], dtype="float32") indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] out_size = paddle.max(dst_index) + 1 - out = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, compute_type="add", pool_type="sum", out_size=out_size) + out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, compute_type="add", pool_type="sum", out_size=out_size) # Outputs: [[1., 3., 4.], [[4., 10., 12.]]] x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") - e = paddle.to_tensor([1, 1, 1], dtype="float32") + y = paddle.to_tensor([1, 1, 1], dtype="float32") indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.geometric.send_ue_recv(x, e, src_index, dst_index, compute_type="add", pool_type="sum") + out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, compute_type="add", pool_type="sum") # Outputs: [[1., 3., 4.], [4., 10., 12.], [0., 0., 0.]] """ @@ -299,7 +299,7 @@ def send_ue_recv(x, x, "X", ("float32", "float64", "int32", "int64", "float16"), "graph_send_ue_recv") check_variable_and_dtype( - y, "E", ("float32", "float64", "int32", "int64", "float16"), + y, "Y", ("float32", "float64", "int32", "int64", "float16"), "graph_send_ue_recv") check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), "graph_send_ue_recv") @@ -317,7 +317,7 @@ def send_ue_recv(x, dst_count = helper.create_variable_for_type_inference(dtype="int32", stop_gradient=True) - inputs = {"X": x, "E": y, "Src_index": src_index, "Dst_index": dst_index} + inputs = {"X": x, "Y": y, "Src_index": src_index, "Dst_index": dst_index} attrs = { "compute_type": compute_type.upper(), "pool_type": pool_type.upper() From 666bd68660241a0569fac5b3d0ed682c6abbe42a Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 10 Aug 2022 04:07:58 +0000 Subject: [PATCH 46/51] add copyright, fix comment --- .../fluid/platform/device/gpu/gpu_primitives.h | 16 ++++++++-------- .../kernels/impl/graph_messaage_passing_impl.h | 1 + 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h 
b/paddle/fluid/platform/device/gpu/gpu_primitives.h index bff05a11daeda..b99d6de5dbbb4 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -422,7 +422,7 @@ CUDA_ATOMIC_WRAPPER(Max, double) { #ifdef PADDLE_CUDA_FP16 inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) { float16 low_half; - // the float16 in lower 16bits + // The float16 in lower 16bits low_half.x = static_cast(val & 0xFFFFu); low_half = static_cast(max(static_cast(low_half), x)); return (val & 0xFFFF0000u) | low_half.x; @@ -430,7 +430,7 @@ inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) { inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) { float16 high_half; - // the float16 in higher 16bits + // The float16 in higher 16bits high_half.x = static_cast(val >> 16); high_half = static_cast(max(static_cast(high_half), x)); return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); @@ -447,7 +447,7 @@ CUDA_ATOMIC_WRAPPER(Max, float16) { uint32_t old = *address_as_ui; uint32_t assumed; if (((uintptr_t)address & 0x02) == 0) { - // the float16 value stay at lower 16 bits of the address. + // The float16 value stay at lower 16 bits of the address. do { assumed = old; old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f)); @@ -456,7 +456,7 @@ CUDA_ATOMIC_WRAPPER(Max, float16) { ret.x = old & 0xFFFFu; return ret; } else { - // the float16 value stay at higher 16 bits of the address. + // The float16 value stay at higher 16 bits of the address. do { assumed = old; old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f)); @@ -555,7 +555,7 @@ CUDA_ATOMIC_WRAPPER(Min, double) { #ifdef PADDLE_CUDA_FP16 inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) { float16 low_half; - // the float16 in lower 16bits + // The float16 in lower 16bits low_half.x = static_cast(val & 0xFFFFu); low_half = static_cast(min(static_cast(low_half), x)); return (val & 0xFFFF0000u) | low_half.x; @@ -563,7 +563,7 @@ inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) { inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) { float16 high_half; - // the float16 in higher 16bits + // The float16 in higher 16bits high_half.x = static_cast(val >> 16); high_half = static_cast(min(static_cast(high_half), x)); return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); @@ -580,7 +580,7 @@ CUDA_ATOMIC_WRAPPER(Min, float16) { uint32_t old = *address_as_ui; uint32_t assumed; if (((uintptr_t)address & 0x02) == 0) { - // the float16 value stay at lower 16 bits of the address. + // The float16 value stay at lower 16 bits of the address. do { assumed = old; old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f)); @@ -589,7 +589,7 @@ CUDA_ATOMIC_WRAPPER(Min, float16) { ret.x = old & 0xFFFFu; return ret; } else { - // the float16 value stay at higher 16 bits of the address. + // The float16 value stay at higher 16 bits of the address. do { assumed = old; old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f)); diff --git a/paddle/phi/kernels/impl/graph_messaage_passing_impl.h b/paddle/phi/kernels/impl/graph_messaage_passing_impl.h index 35e51fb930c8d..dc1477e77227b 100644 --- a/paddle/phi/kernels/impl/graph_messaage_passing_impl.h +++ b/paddle/phi/kernels/impl/graph_messaage_passing_impl.h @@ -1,4 +1,5 @@ // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright The DGL team. 
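Aside (editor's sketch, not from the patch): the float16 atomic max/min above works by CAS-ing the aligned 32-bit word that contains the half-precision value. The half-word splice can be modeled with plain Python integers and NumPy bit views; the real kernel runs this splice inside an atomicCAS retry loop.

    import numpy as np

    def f16_bits_to_float(bits):
        return float(np.array([bits], dtype=np.uint16).view(np.float16)[0])

    def float_to_f16_bits(f):
        return int(np.array([f], dtype=np.float16).view(np.uint16)[0])

    def max_to_low_half(word, x):
        # The float16 in the lower 16 bits: decode, take the max, splice back.
        new = float_to_f16_bits(max(f16_bits_to_float(word & 0xFFFF), x))
        return (word & 0xFFFF0000) | new

    def max_to_high_half(word, x):
        # The float16 in the higher 16 bits, same idea.
        new = float_to_f16_bits(max(f16_bits_to_float(word >> 16), x))
        return (word & 0x0000FFFF) | (new << 16)

    # Shape of the device-side retry loop (pseudocode):
    #   do { assumed = old;
    #        old = atomicCAS(addr, assumed, max_to_low_half(assumed, val)); }
    #   while (old != assumed);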
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. From a480092e19ec1a6354ead9809bd4db7abf71e9ff Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 10 Aug 2022 09:38:07 +0000 Subject: [PATCH 47/51] review code --- .../cpu/graph_send_ue_recv_grad_kernel.cc | 50 +++++++++---------- .../kernels/cpu/graph_send_ue_recv_kernel.cc | 42 ++++++++-------- .../kernels/gpu/graph_send_ue_recv_funcs.h | 3 -- .../unittests/test_graph_send_ue_recv_op.py | 3 +- .../geometric/message_passing/send_recv.py | 12 ++--- 5 files changed, 54 insertions(+), 56 deletions(-) diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc index 55b778bf8e502..c3ae8563370f8 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc @@ -356,42 +356,42 @@ void GraphSendUERecvGradOpKernelLaunchHelper( const Context& ctx, const DenseTensor& out_grad, const DenseTensor& x, - const DenseTensor& e, + const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& compute_type, const std::string& pool_type, DenseTensor* x_grad, - DenseTensor* e_grad, + DenseTensor* y_grad, const DenseTensor* dst_count = nullptr, const DenseTensor* out = nullptr) { const int& index_size = dst_index.dims()[0]; ctx.template Alloc(x_grad); T* x_grad_data = x_grad->data(); - ctx.template Alloc(e_grad); - T* e_grad_data = e_grad->data(); + ctx.template Alloc(y_grad); + T* y_grad_data = y_grad->data(); const auto& x_dims = x.dims(); - const auto& e_dims = e.dims(); - int64_t memset_size_x = 1, memset_size_e = 1; + const auto& y_dims = y.dims(); + int64_t memset_size_x = 1, memset_size_y = 1; int64_t slice_size = 1; for (int i = 0; i < x_dims.size(); i++) { memset_size_x *= x_dims[i]; if (i > 0) slice_size *= x_dims[i]; } - for (int i = 0; i < e_dims.size(); i++) { - memset_size_e *= e_dims[i]; + for (int i = 0; i < y_dims.size(); i++) { + memset_size_y *= y_dims[i]; } const size_t& memset_bytes_x = memset_size_x * sizeof(T); - const size_t& memset_bytes_e = memset_size_e * sizeof(T); + const size_t& memset_bytes_y = memset_size_y * sizeof(T); memset(x_grad_data, 0, memset_bytes_x); - memset(e_grad_data, 0, memset_bytes_e); + memset(y_grad_data, 0, memset_bytes_y); if (index_size == 0) return; const T* out_grad_data = out_grad.data(); const T* x_data = x.data(); - const T* e_data = e.data(); + const T* y_data = y.data(); const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); @@ -399,10 +399,10 @@ void GraphSendUERecvGradOpKernelLaunchHelper( CalculateXGrad(ctx, out_grad_data, x_data, - e_data, + y_data, out_grad.dims(), x_dims, - e_dims, + y_dims, d_index, s_index, compute_type, @@ -415,29 +415,29 @@ void GraphSendUERecvGradOpKernelLaunchHelper( out); CalculateEGrad(out_grad_data, x_data, - e_data, + y_data, x_dims, - e_dims, + y_dims, s_index, d_index, compute_type, pool_type, index_size, - e_grad_data, + y_grad_data, dst_count); } else if (pool_type == "MIN" || pool_type == "MAX") { CalculateXEGradForMinMax(out_grad_data, x_data, - e_data, + y_data, x_dims, - e_dims, + y_dims, d_index, s_index, compute_type, pool_type, index_size, x_grad_data, - e_grad_data, + y_grad_data, out); } } @@ -445,7 +445,7 @@ void GraphSendUERecvGradOpKernelLaunchHelper( template void GraphSendUERecvGradKernel(const Context& ctx, const DenseTensor& x, - const 
DenseTensor& e, + const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, const paddle::optional& out, @@ -454,20 +454,20 @@ void GraphSendUERecvGradKernel(const Context& ctx, const std::string& compute_type, const std::string& pool_type, DenseTensor* x_grad, - DenseTensor* e_grad) { + DenseTensor* y_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUERecvGradOpKernelLaunchHelper( ctx, out_grad, x, - e, + y, src_index, dst_index, compute_type, pool_type, x_grad, - e_grad, + y_grad, dst_count.get_ptr(), out.get_ptr()); } else if (index_type == phi::DataType::INT64) { @@ -475,13 +475,13 @@ void GraphSendUERecvGradKernel(const Context& ctx, ctx, out_grad, x, - e, + y, src_index, dst_index, compute_type, pool_type, x_grad, - e_grad, + y_grad, dst_count.get_ptr(), out.get_ptr()); } diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc index 6f479c7deb3cf..5c3760657be86 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc @@ -29,7 +29,7 @@ namespace phi { template void GraphSendUERecvSumCpuKernel(const BroadCastInfo& bcast, const T* x_data, - const T* e_data, + const T* y_data, const IndexT* src_indices, const IndexT* dst_indices, T* output, @@ -43,11 +43,11 @@ void GraphSendUERecvSumCpuKernel(const BroadCastInfo& bcast, IndexT dst = dst_indices[i]; T* out_off = output + dst * bcast.out_len; const T* x_off = x_data + src * bcast.l_len; - const T* e_off = e_data + i * bcast.r_len; + const T* y_off = y_data + i * bcast.r_len; for (int64_t j = 0; j < bcast.out_len; j++) { int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; - int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; - T val = cfunctor(x_off[x_add], e_off[e_add]); + int64_t y_add = bcast.use_bcast ? bcast.r_offset[j] : j; + T val = cfunctor(x_off[x_add], y_off[y_add]); if (val != 0) { #ifdef PADDLE_WITH_MKLML #pragma omp atomic @@ -64,7 +64,7 @@ template void GraphSendUERecvMinMaxCpuKernel(const BroadCastInfo& bcast, const T* x_data, - const T* e_data, + const T* y_data, const IndexT* src_indices, const IndexT* dst_indices, T* output, @@ -80,17 +80,17 @@ void GraphSendUERecvMinMaxCpuKernel(const BroadCastInfo& bcast, IndexT dst = dst_indices[i]; T* out_off = output + dst * bcast.out_len; const T* x_off = x_data + src * bcast.l_len; - const T* e_off = e_data + i * bcast.r_len; + const T* y_off = y_data + i * bcast.r_len; bool in_set = existed_dst.find(dst) != existed_dst.end(); for (int64_t j = 0; j < bcast.out_len; j++) { int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; - int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; - T val = cfunctor(x_off[x_add], e_off[e_add]); + int64_t y_add = bcast.use_bcast ? 
bcast.r_offset[j] : j; + T val = cfunctor(x_off[x_add], y_off[y_add]); #ifdef PADDLE_WITH_MKLML #pragma omp critical #endif if (!in_set) { - out_off[j] += val; + out_off[j] = val; } else { out_off[j] = pfunctor(out_off[j], val); } @@ -107,7 +107,7 @@ void GraphSendUERecvMinMaxCpuKernel(const BroadCastInfo& bcast, template void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, const DenseTensor& x, - const DenseTensor& e, + const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& compute_type, @@ -135,9 +135,9 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, memset(out_data, 0, memset_bytes); if (index_size == 0) return; - const auto& bcast_info = phi::CalcBCastInfo(x.dims(), e.dims()); + const auto& bcast_info = phi::CalcBCastInfo(x.dims(), y.dims()); const T* x_data = x.data(); - const T* e_data = e.data(); + const T* y_data = y.data(); const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); if (pool_type == "SUM" || pool_type == "MEAN") { @@ -145,7 +145,7 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, GraphAddFunctor add_functor; GraphSendUERecvSumCpuKernel>(bcast_info, x_data, - e_data, + y_data, s_index, d_index, out_data, @@ -155,7 +155,7 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, GraphMulFunctor mul_functor; GraphSendUERecvSumCpuKernel>(bcast_info, x_data, - e_data, + y_data, s_index, d_index, out_data, @@ -187,7 +187,7 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, GraphAddFunctor, GraphMinFunctor>(bcast_info, x_data, - e_data, + y_data, s_index, d_index, out_data, @@ -201,7 +201,7 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, GraphMulFunctor, GraphMinFunctor>(bcast_info, x_data, - e_data, + y_data, s_index, d_index, out_data, @@ -218,7 +218,7 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, GraphAddFunctor, GraphMaxFunctor>(bcast_info, x_data, - e_data, + y_data, s_index, d_index, out_data, @@ -232,7 +232,7 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, GraphMulFunctor, GraphMaxFunctor>(bcast_info, x_data, - e_data, + y_data, s_index, d_index, out_data, @@ -246,7 +246,7 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, template void GraphSendUERecvKernel(const Context& ctx, const DenseTensor& x, - const DenseTensor& e, + const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& compute_type, @@ -259,7 +259,7 @@ void GraphSendUERecvKernel(const Context& ctx, if (index_type == phi::DataType::INT32) { GraphSendUERecvOpKernelLaunchHelper(ctx, x, - e, + y, src_index, dst_index, compute_type, @@ -270,7 +270,7 @@ void GraphSendUERecvKernel(const Context& ctx, } else if (index_type == phi::DataType::INT64) { GraphSendUERecvOpKernelLaunchHelper(ctx, x, - e, + y, src_index, dst_index, compute_type, diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index 473fd04494238..c11f8c123c40f 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -16,9 +16,6 @@ #include #include -#include -#include - #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/hostdevice.h" diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index 
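Editor's note (not part of the patch): the `out_off[j] = val` change above makes the first visit to a destination an explicit initialization rather than an accumulation into the zero-initialized buffer. In NumPy terms, the MIN path of that CPU loop looks like this sketch (scatter_min is an illustrative name):

    import numpy as np

    def scatter_min(msg, dst_index, num_nodes):
        out = np.zeros((num_nodes,) + msg.shape[1:], dtype=msg.dtype)
        seen = set()
        for i, d in enumerate(dst_index):
            if d not in seen:
                out[d] = msg[i]                      # first visit: overwrite the zeros
                seen.add(d)
            else:
                out[d] = np.minimum(out[d], msg[i])  # later visits: reduce
        return out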
414cc6c639714..b001b4d56b9c0 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -1,5 +1,6 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# Copyright 2022 The DGL team. + # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index 7279d9d031f90..f5bf1850fd581 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -41,7 +41,7 @@ def send_u_recv(x, Given: - X = [[0, 2, 3], + x = [[0, 2, 3], [1, 4, 5], [2, 6, 7]] @@ -55,7 +55,7 @@ def send_u_recv(x, Then: - Out = [[0, 2, 3], + out = [[0, 2, 3], [2, 8, 10], [1, 4, 5]] @@ -176,7 +176,7 @@ def send_ue_recv(x, This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding data, after computing with `y` in different compute types, then use `dst_index` to + to gather the corresponding data, after computing with `y` in different compute types like add/sub/mul/div, then use `dst_index` to update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. Besides, we can use `out_size` to set necessary output shape. @@ -184,11 +184,11 @@ def send_ue_recv(x, Given: - X = [[0, 2, 3], + x = [[0, 2, 3], [1, 4, 5], [2, 6, 7]] - Y = [1, 1, 1] + y = [1, 1, 1] src_index = [0, 1, 2, 0] @@ -202,7 +202,7 @@ def send_ue_recv(x, Then: - Out = [[1, 3, 4], + out = [[1, 3, 4], [4, 10, 12], [2, 5, 6]] Args: From 36d1eab27774c82a22d7d1ebf7e402b7a7071d69 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 11 Aug 2022 06:07:37 +0000 Subject: [PATCH 48/51] fix thread block size --- .../phi/kernels/gpu/graph_send_ue_recv_funcs.h | 5 ++--- .../gpu/graph_send_ue_recv_grad_kernel.cu | 16 ++++++++-------- .../phi/kernels/gpu/graph_send_ue_recv_kernel.cu | 4 ++-- .../geometric/message_passing/send_recv.py | 8 ++++++-- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index c11f8c123c40f..776f04ea85156 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -23,8 +23,6 @@ namespace phi { -#define CUDA_MAX_NUM_THREADS 1024 - inline void CopyBCastOff(const BroadCastInfo& bcast_info, thrust::device_vector& l_bcastoff, thrust::device_vector& r_bcastoff) { @@ -51,7 +49,7 @@ inline void CopyBCastOff(const BroadCastInfo& bcast_info, #endif } -inline int FindNumThreads(int dim, int max_num_threads = CUDA_MAX_NUM_THREADS) { +inline int FindNumThreads(int dim, int max_num_threads) { PADDLE_ENFORCE_GE(dim, 0, phi::errors::PreconditionNotMet( @@ -61,6 +59,7 @@ inline int FindNumThreads(int dim, int max_num_threads = CUDA_MAX_NUM_THREADS) { while (res > dim) { res = res >> 1; } + res = res <= 32 ? 
32 : res; return res; } diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu index bdb4e327e9294..7d89a1bc7d82e 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu @@ -49,8 +49,8 @@ void CalculateXEGradForMinMax(const Context& ctx, } int64_t out_len = bcast_info.out_len; - const int ntx = FindNumThreads(out_len); - const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); + const int nty = ctx.GetMaxThreadsPerBlock() / ntx; const int nbx = (out_len + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid(nbx, nby); @@ -180,8 +180,8 @@ void CalculateXGrad(const Context& ctx, CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); } int64_t out_len = bcast_info.out_len; - const int ntx = FindNumThreads(out_len); - const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); + const int nty = ctx.GetMaxThreadsPerBlock() / ntx; const int nbx = (out_len + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid_(nbx, nby); @@ -303,8 +303,8 @@ void CalculateXGrad(const Context& ctx, CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); } int64_t out_len = bcast_info.out_len; - const int ntx = FindNumThreads(out_len); - const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); + const int nty = ctx.GetMaxThreadsPerBlock() / ntx; const int nbx = (out_len + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid_(nbx, nby); @@ -389,8 +389,8 @@ void CalculateEGrad(const Context& ctx, CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); } int64_t out_len = bcast_info.out_len; - const int ntx = FindNumThreads(out_len); - const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); + const int nty = ctx.GetMaxThreadsPerBlock() / ntx; const int nbx = (out_len + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid(nbx, nby); diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu index 4c83bd02473a1..28e304266dabd 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu @@ -92,8 +92,8 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, } int64_t out_len = bcast_info.out_len; - const int ntx = FindNumThreads(out_len); // 一个block包含的Thread数 - const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); + const int nty = ctx.GetMaxThreadsPerBlock() / ntx; const int nbx = (out_len + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid(nbx, nby); diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index f5bf1850fd581..ae7ec1a0f2ad9 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -61,6 +61,7 @@ def send_u_recv(x, Args: x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. + And we support float16 in gpu version. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. 
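Editor's note (not part of the patch): patch 48 threads the device's real per-block limit through FindNumThreads instead of a hard-coded 1024, and floors the result at one warp; together with the small dim == 0 fix in the next patch, the block/grid sizing comes out as below (Python sketch, illustrative values).

    def find_num_threads(dim, max_num_threads):
        res = max_num_threads
        if dim == 0:
            res = 1
        while res > dim:
            res >>= 1
        return 32 if res <= 32 else res            # at least one warp

    out_len, index_size = 48, 10000
    max_threads = 1024                             # stands in for ctx.GetMaxThreadsPerBlock()
    ntx = find_num_threads(out_len, max_threads)   # threads across the feature axis -> 32
    nty = max_threads // ntx                       # threads across the edge axis    -> 32
    nbx = (out_len + ntx - 1) // ntx               # grid x: ceil over features
    nby = (index_size + nty - 1) // nty            # grid y: ceil over edges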
dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. @@ -127,8 +128,9 @@ def send_u_recv(x, return _C_ops.final_state_graph_send_recv(x, src_index, dst_index, pool_type.upper(), out_size) - check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), - "graph_send_recv") + check_variable_and_dtype( + x, "X", ("float32", "float64", "int32", "int64", "float16"), + "graph_send_recv") check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), "graph_send_recv") check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), @@ -207,7 +209,9 @@ def send_ue_recv(x, [2, 5, 6]] Args: x (Tensor): The input node feature tensor, and the available data type is float32, float64, int32, int64. + And we support float16 in gpu version. y (Tensor): The input edge feature tensor, and the available data type is float32, float64, int32, int64. + And we support float16 in gpu version. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. From 50bf7da6310f90ae74cfac2853f2b94673d03204 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 11 Aug 2022 06:14:02 +0000 Subject: [PATCH 49/51] fix thread block size --- paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h index 776f04ea85156..49b48b5397538 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -54,8 +54,8 @@ inline int FindNumThreads(int dim, int max_num_threads) { 0, phi::errors::PreconditionNotMet( "Required dim >= 0, but received dim = %d", dim)); - if (dim == 0) return 1; int res = max_num_threads; + if (dim == 0) res = 1; while (res > dim) { res = res >> 1; } From e7cbc9fbc9049522c171a430c1989ebeda8302e3 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 11 Aug 2022 13:22:57 +0000 Subject: [PATCH 50/51] change api attribute name: pool_type to reduce_op, compute_type to message_op --- .../unittests/test_graph_send_recv_op.py | 4 +- .../unittests/test_graph_send_ue_recv_op.py | 110 +++++++++--------- .../geometric/message_passing/send_recv.py | 77 ++++++------ 3 files changed, 94 insertions(+), 97 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py index 73c1525519066..1b7d8213e75ac 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py @@ -25,11 +25,11 @@ def graph_send_recv_wrapper(x, src_index, dst_index, - pool_type="sum", + reduce_op="sum", out_size=None, name=None): return paddle.geometric.send_u_recv(x, src_index, dst_index, - pool_type.lower(), out_size, name) + reduce_op.lower(), out_size, name) class TestGraphSendRecvMaxOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index b001b4d56b9c0..25f4d3cb660f0 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -103,7 +103,7 @@ def compute_graph_send_ue_recv_for_sum(inputs, attributes): y = 
inputs['Y'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] - compute_type = attributes['compute_type'] + message_op = attributes['compute_type'] gather_x = x[src_index] out_shp = [ @@ -112,9 +112,9 @@ def compute_graph_send_ue_recv_for_sum(inputs, attributes): results = np.zeros(out_shp, dtype=x.dtype) # Calculate forward output. - if compute_type == 'ADD': + if message_op == 'ADD': x_compute_y = gather_x + y - elif compute_type == 'MUL': + elif message_op == 'MUL': x_compute_y = gather_x * y for index, s_id in enumerate(dst_index): results[s_id, :] += x_compute_y[index, :] @@ -126,7 +126,7 @@ def compute_graph_send_ue_recv_for_mean(inputs, attributes): y = inputs['Y'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] - compute_type = attributes['compute_type'] + message_op = attributes['compute_type'] gather_x = x[src_index] out_shp = [ @@ -135,9 +135,9 @@ def compute_graph_send_ue_recv_for_mean(inputs, attributes): results = np.zeros(out_shp, dtype=x.dtype) # Calculate forward output. - if compute_type == 'ADD': + if message_op == 'ADD': x_compute_y = gather_x + y - elif compute_type == 'MUL': + elif message_op == 'MUL': x_compute_y = gather_x * y count = np.zeros(out_shp[0], dtype=np.int32) for index, s_id in enumerate(dst_index): @@ -155,8 +155,8 @@ def compute_graph_send_ue_recv_for_max_min(inputs, attributes): y = inputs['Y'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] - compute_type = attributes['compute_type'] - pool_type = attributes['pool_type'] + message_op = attributes['compute_type'] + reduce_op = attributes['pool_type'] gather_x = x[src_index] out_shp = [ @@ -165,13 +165,13 @@ def compute_graph_send_ue_recv_for_max_min(inputs, attributes): results = np.zeros(out_shp, dtype=x.dtype) # Calculate forward output. - if compute_type == 'ADD': + if message_op == 'ADD': x_compute_y = gather_x + y - elif compute_type == 'MUL': + elif message_op == 'MUL': x_compute_y = gather_x * y first_set = set() - if pool_type == 'MAX': + if reduce_op == 'MAX': for index, s_id in enumerate(dst_index): if s_id not in first_set: results[s_id, :] += x_compute_y[index, :] @@ -179,7 +179,7 @@ def compute_graph_send_ue_recv_for_max_min(inputs, attributes): else: results[s_id, :] = np.maximum(results[s_id, :], x_compute_y[index, :]) - elif pool_type == 'MIN': + elif reduce_op == 'MIN': for index, s_id in enumerate(dst_index): if s_id not in first_set: results[s_id, :] += x_compute_y[index, :] @@ -188,7 +188,7 @@ def compute_graph_send_ue_recv_for_max_min(inputs, attributes): results[s_id, :] = np.minimum(results[s_id, :], x_compute_y[index, :]) else: - raise ValueError("Invalid pool_type, only MAX, MIN supported!") + raise ValueError("Invalid reduce_op, only MAX, MIN supported!") # Calculate backward gradient. 
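Editor's note (not part of the patch): the backward reference that follows encodes the usual subgradient rule for max/min reductions — gradient flows only through the messages that attained the reduced output. In compact NumPy form, under the ADD message op (MUL additionally scales by the other operand); minmax_grad_mask is an illustrative name:

    import numpy as np

    def minmax_grad_mask(msg, out, dst_index):
        # 1 where a message equals the output row it was reduced into, else 0.
        return (msg == out[dst_index]).astype(msg.dtype)

    # Under ADD: dval/dx = 1 and dval/dy = 1, both masked by minmax_grad_mask.
    # Under MUL: dval/dx = y and dval/dy = x, again masked the same way.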
x_gradient = np.zeros_like(x) @@ -206,7 +206,7 @@ def compute_graph_send_ue_recv_for_max_min(inputs, attributes): for j in range(bcast_info.out_len): x_add = bcast_info.lhs_offset[j] if use_broadcast else j y_add = bcast_info.rhs_offset[j] if use_broadcast else j - if compute_type == 'ADD': + if message_op == 'ADD': if len(x_off.shape) == 1 and len(y_off.shape) == 1: val = x_off[x_add] + y_off[y_add] x_grad_off[x_add] += 1 * (val == out_off[j]) @@ -224,7 +224,7 @@ def compute_graph_send_ue_recv_for_max_min(inputs, attributes): val == out_off[out_add_0][out_add_1]) y_grad_off[y_add_0][y_add_1] += 1 * ( val == out_off[out_add_0][out_add_1]) - elif compute_type == 'MUL': + elif message_op == 'MUL': if len(x_off.shape) == 1 and len(y_off.shape) == 1: val = x_off[x_add] * y_off[y_add] x_grad_off[x_add] += 1 * (val == out_off[j]) * y_off[y_add] @@ -254,13 +254,13 @@ def graph_send_ue_recv_wrapper(x, y, src_index, dst_index, - compute_type="add", - pool_type="sum", + message_op="add", + reduce_op="sum", out_size=None, name=None): return paddle.geometric.send_ue_recv(x, y, src_index, dst_index, - compute_type.lower(), - pool_type.lower(), out_size, name) + message_op.lower(), reduce_op.lower(), + out_size, name) class TestGraphSendUERecvSumOp(OpTest): @@ -277,7 +277,7 @@ def setUp(self): 'Src_index': self.src_index, 'Dst_index': self.dst_index } - self.attrs = {'compute_type': self.compute_type, 'pool_type': 'SUM'} + self.attrs = {'compute_type': self.message_op, 'pool_type': 'SUM'} out = compute_graph_send_ue_recv_for_sum(self.inputs, self.attrs) @@ -289,7 +289,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' def test_check_output(self): self.check_output(check_eager=True) @@ -306,7 +306,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestSumCase2(TestGraphSendUERecvSumOp): @@ -317,7 +317,7 @@ def set_config(self): index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestSumCase3(TestGraphSendUERecvSumOp): @@ -328,7 +328,7 @@ def set_config(self): index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestSumCase4(TestGraphSendUERecvSumOp): @@ -339,7 +339,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestSumCase5(TestGraphSendUERecvSumOp): @@ -350,7 +350,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestSumCase6(TestGraphSendUERecvSumOp): @@ -361,7 +361,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestSumCase7(TestGraphSendUERecvSumOp): @@ -372,7 +372,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = 
index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestGraphSendUERecvMeanOp(OpTest): @@ -389,7 +389,7 @@ def setUp(self): 'Src_index': self.src_index, 'Dst_index': self.dst_index } - self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MEAN'} + self.attrs = {'compute_type': self.message_op, 'pool_type': 'MEAN'} out, dst_count = compute_graph_send_ue_recv_for_mean( self.inputs, self.attrs) @@ -402,7 +402,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' def test_check_output(self): self.check_output(check_eager=True) @@ -419,7 +419,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestMeanCase2(TestGraphSendUERecvMeanOp): @@ -430,7 +430,7 @@ def set_config(self): index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestMeanCase3(TestGraphSendUERecvMeanOp): @@ -441,7 +441,7 @@ def set_config(self): index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestMeanCase4(TestGraphSendUERecvMeanOp): @@ -452,7 +452,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestMeanCase5(TestGraphSendUERecvMeanOp): @@ -463,7 +463,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestMeanCase6(TestGraphSendUERecvMeanOp): @@ -474,7 +474,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestMeanCase7(TestGraphSendUERecvMeanOp): @@ -485,7 +485,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestGraphSendUERecvMaxOp(OpTest): @@ -502,7 +502,7 @@ def setUp(self): 'Src_index': self.src_index, 'Dst_index': self.dst_index } - self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MAX'} + self.attrs = {'compute_type': self.message_op, 'pool_type': 'MAX'} out, self.gradients = compute_graph_send_ue_recv_for_max_min( self.inputs, self.attrs) @@ -515,7 +515,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' def test_check_output(self): self.check_output(check_eager=True) @@ -535,7 +535,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestMaxCase2(TestGraphSendUERecvMaxOp): @@ -546,7 +546,7 @@ def set_config(self): index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - 
self.compute_type = 'ADD' + self.message_op = 'ADD' class TestMaxCase3(TestGraphSendUERecvMaxOp): @@ -557,7 +557,7 @@ def set_config(self): index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestMaxCase4(TestGraphSendUERecvMaxOp): @@ -568,7 +568,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestMaxCase5(TestGraphSendUERecvMaxOp): @@ -579,7 +579,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestMaxCase6(TestGraphSendUERecvMaxOp): @@ -590,7 +590,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestMaxCase7(TestGraphSendUERecvMaxOp): @@ -601,7 +601,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestGraphSendUERecvMinOp(OpTest): @@ -618,7 +618,7 @@ def setUp(self): 'Src_index': self.src_index, 'Dst_index': self.dst_index } - self.attrs = {'compute_type': self.compute_type, 'pool_type': 'MIN'} + self.attrs = {'compute_type': self.message_op, 'pool_type': 'MIN'} out, self.gradients = compute_graph_send_ue_recv_for_max_min( self.inputs, self.attrs) @@ -631,7 +631,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' def test_check_output(self): self.check_output(check_eager=True) @@ -651,7 +651,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestMinCase2(TestGraphSendUERecvMinOp): @@ -662,7 +662,7 @@ def set_config(self): index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestMinCase3(TestGraphSendUERecvMinOp): @@ -673,7 +673,7 @@ def set_config(self): index = np.random.randint(0, 10, (150, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestMinCase4(TestGraphSendUERecvMinOp): @@ -684,7 +684,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestMinCase5(TestGraphSendUERecvMinOp): @@ -695,7 +695,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestMinCase6(TestGraphSendUERecvMinOp): @@ -706,7 +706,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestMinCase7(TestGraphSendUERecvMinOp): @@ 
-717,7 +717,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class API_GeometricSendUERecvTest(unittest.TestCase): diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index ae7ec1a0f2ad9..cebd927566c97 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -25,7 +25,7 @@ def send_u_recv(x, src_index, dst_index, - pool_type="sum", + reduce_op="sum", out_size=None, name=None): """ @@ -35,7 +35,7 @@ def send_u_recv(x, This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor - in different pooling types, like sum, mean, max, or min. Besides, we can use `out_size` to set necessary output shape. + in different reduce ops, like sum, mean, max, or min. Besides, we can use `out_size` to set necessary output shape. .. code-block:: text @@ -49,7 +49,7 @@ def send_u_recv(x, dst_index = [1, 2, 1, 0] - pool_type = "sum" + reduce_op = "sum" out_size = None @@ -65,7 +65,7 @@ def send_u_recv(x, src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. - pool_type (str): Different pooling types, including `sum`, `mean`, `max`, `min`. + reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. Default value is `sum`. out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size is smaller or equal to 0, then this input will not be used. 
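Editor's note (not part of the patch): after this rename, call sites pass reduce_op instead of pool_type and the semantics are unchanged. A minimal usage sketch mirroring the updated docstring:

    import paddle

    x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
    src_index = paddle.to_tensor([0, 1, 2, 0], dtype="int32")
    dst_index = paddle.to_tensor([1, 2, 1, 0], dtype="int32")
    # reduce_op is the new keyword; pool_type was the old spelling.
    out = paddle.geometric.send_u_recv(x, src_index, dst_index, reduce_op="sum")
    # out: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]]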
@@ -89,7 +89,7 @@ def send_u_recv(x, indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum") + out = paddle.geometric.send_u_recv(x, src_index, dst_index, reduce_op="sum") # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]] x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") @@ -97,22 +97,22 @@ def send_u_recv(x, src_index = indexes[:, 0] dst_index = indexes[:, 1] out_size = paddle.max(dst_index) + 1 - out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size) + out = paddle.geometric.send_u_recv(x, src_index, dst_index, reduce_op="sum", out_size=out_size) # Outputs: [[0., 2., 3.], [[2., 8., 10.]]] x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.geometric.send_u_recv(x, src_index, dst_index, pool_type="sum") + out = paddle.geometric.send_u_recv(x, src_index, dst_index, reduce_op="sum") # Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]] """ - if pool_type not in ["sum", "mean", "max", "min"]: + if reduce_op not in ["sum", "mean", "max", "min"]: raise ValueError( - "pool_type should be `sum`, `mean`, `max` or `min`, but received %s" - % pool_type) + "reduce_op should be `sum`, `mean`, `max` or `min`, but received %s" + % reduce_op) # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. @@ -120,13 +120,13 @@ def send_u_recv(x, out_size = convert_out_size_to_list(out_size) out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index, None, 'pool_type', - pool_type.upper(), 'out_size', + reduce_op.upper(), 'out_size', out_size) return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) return _C_ops.final_state_graph_send_recv(x, src_index, dst_index, - pool_type.upper(), out_size) + reduce_op.upper(), out_size) check_variable_and_dtype( x, "X", ("float32", "float64", "int32", "int64", "float16"), @@ -148,7 +148,7 @@ def send_u_recv(x, stop_gradient=True) inputs = {"X": x, "Src_index": src_index, "Dst_index": dst_index} - attrs = {"pool_type": pool_type.upper()} + attrs = {"pool_type": reduce_op.upper()} get_out_size_tensor_inputs(inputs=inputs, attrs=attrs, out_size=out_size, @@ -168,8 +168,8 @@ def send_ue_recv(x, y, src_index, dst_index, - compute_type="add", - pool_type="sum", + message_op="add", + reduce_op="sum", out_size=None, name=None): """ @@ -196,9 +196,9 @@ def send_ue_recv(x, dst_index = [1, 2, 1, 0] - compute_type = "add" + message_op = "add" - pool_type = "sum" + reduce_op = "sum" out_size = None @@ -215,8 +215,8 @@ def send_ue_recv(x, src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. - compute_type (str): Different compute types for x and e, including `add`, `sub`, `mul`, `div`. - pool_type (str): Different pooling types, including `sum`, `mean`, `max`, `min`. + message_op (str): Different compute types for x and e, including `add`, `sub`, `mul`, `div`. + reduce_op (str): Different pooling types, including `sum`, `mean`, `max`, `min`. Default value is `sum`. out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size is smaller or equal to 0, then this input will not be used. 
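Aside (editor's sketch, not from the patch): y is per-edge — its first dimension equals the number of edges — and broadcasts against the gathered x rows, which is why a [num_edges, 1] edge feature can offset or scale every channel. A quick NumPy shape check with illustrative values:

    import numpy as np

    x = np.random.rand(3, 8).astype("float32")   # num_nodes x channels
    y = np.random.rand(4, 1).astype("float32")   # num_edges x 1, broadcastable
    src_index = np.array([0, 1, 2, 0])
    msg = x[src_index] * y                       # (4, 8): per-edge messages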
@@ -241,7 +241,7 @@ def send_ue_recv(x, indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, compute_type="add", pool_type="sum") + out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, message_op="add", reduce_op="sum") # Outputs: [[1., 3., 4.], [4., 10., 12.], [2., 5., 6.]] x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") @@ -250,7 +250,7 @@ def send_ue_recv(x, src_index = indexes[:, 0] dst_index = indexes[:, 1] out_size = paddle.max(dst_index) + 1 - out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, compute_type="add", pool_type="sum", out_size=out_size) + out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, message_op="add", reduce_op="sum", out_size=out_size) # Outputs: [[1., 3., 4.], [[4., 10., 12.]]] x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") @@ -258,28 +258,28 @@ def send_ue_recv(x, indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, compute_type="add", pool_type="sum") + out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index, message_op="add", reduce_op="sum") # Outputs: [[1., 3., 4.], [4., 10., 12.], [0., 0., 0.]] """ - if compute_type not in ["add", "sub", "mul", "div"]: + if message_op not in ["add", "sub", "mul", "div"]: raise ValueError( - "compute_type should be `add`, `sub`, `mul`, `div`, but received %s" - % compute_type) + "message_op should be `add`, `sub`, `mul`, `div`, but received %s" % + message_op) - if pool_type not in ["sum", "mean", "max", "min"]: + if reduce_op not in ["sum", "mean", "max", "min"]: raise ValueError( - "pool_type should be `sum`, `mean`, `max` or `min`, but received %s" - % pool_type) + "reduce_op should be `sum`, `mean`, `max` or `min`, but received %s" + % reduce_op) x, y = reshape_lhs_rhs(x, y) - if compute_type == 'sub': - compute_type = 'add' + if message_op == 'sub': + message_op = 'add' y = -y - if compute_type == "div": - compute_type = 'mul' + if message_op == "div": + message_op = 'mul' y = 1. / y # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. 
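Editor's note (not part of the patch): only add and mul have dedicated kernels; the hunk above canonicalizes sub and div onto them by transforming y before dispatch, so gradients also follow the add/mul paths with the transformed operand. The rewrite in isolation:

    def canonicalize_message_op(message_op, y):
        # sub(x, y) == add(x, -y); div(x, y) == mul(x, 1/y)
        if message_op == "sub":
            return "add", -y
        if message_op == "div":
            return "mul", 1.0 / y
        return message_op, y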
@@ -288,15 +288,15 @@ def send_ue_recv(x, out_size = convert_out_size_to_list(out_size) out, tmp = _C_ops.graph_send_ue_recv(x, y, src_index, dst_index, None, 'compute_type', - compute_type.upper(), 'pool_type', - pool_type.upper(), 'out_size', + message_op.upper(), 'pool_type', + reduce_op.upper(), 'out_size', out_size) return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) return _C_ops.final_state_graph_send_ue_recv(x, y, src_index, dst_index, - compute_type.upper(), - pool_type.upper(), + message_op.upper(), + reduce_op.upper(), out_size) check_variable_and_dtype( @@ -322,10 +322,7 @@ def send_ue_recv(x, stop_gradient=True) inputs = {"X": x, "Y": y, "Src_index": src_index, "Dst_index": dst_index} - attrs = { - "compute_type": compute_type.upper(), - "pool_type": pool_type.upper() - } + attrs = {"compute_type": message_op.upper(), "pool_type": reduce_op.upper()} get_out_size_tensor_inputs(inputs=inputs, attrs=attrs, out_size=out_size, From 2b0bd9ae1ee8597900dc164fd419fc505760db65 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 11 Aug 2022 14:14:20 +0000 Subject: [PATCH 51/51] change api attribute name, move pool_type to reduce_op, move compute_type to message_op --- paddle/fluid/operators/graph_send_recv_op.cc | 12 ++-- .../fluid/operators/graph_send_ue_recv_op.cc | 16 ++--- paddle/phi/api/yaml/legacy_api.yaml | 4 +- paddle/phi/api/yaml/legacy_backward.yaml | 8 +-- paddle/phi/infermeta/multiary.cc | 6 +- paddle/phi/infermeta/multiary.h | 4 +- paddle/phi/infermeta/ternary.cc | 4 +- paddle/phi/infermeta/ternary.h | 2 +- .../cpu/graph_send_recv_grad_kernel.cc | 28 ++++---- .../phi/kernels/cpu/graph_send_recv_kernel.cc | 32 ++++----- .../cpu/graph_send_ue_recv_grad_kernel.cc | 72 +++++++++---------- .../kernels/cpu/graph_send_ue_recv_kernel.cc | 36 +++++----- .../gpu/graph_send_recv_grad_kernel.cu | 14 ++-- .../phi/kernels/gpu/graph_send_recv_kernel.cu | 22 +++--- .../gpu/graph_send_ue_recv_grad_kernel.cu | 72 +++++++++---------- .../kernels/gpu/graph_send_ue_recv_kernel.cu | 42 +++++------ .../phi/kernels/graph_send_recv_grad_kernel.h | 2 +- paddle/phi/kernels/graph_send_recv_kernel.h | 2 +- .../kernels/graph_send_ue_recv_grad_kernel.h | 4 +- .../phi/kernels/graph_send_ue_recv_kernel.h | 4 +- paddle/phi/ops/compat/graph_send_recv_sig.cc | 6 +- .../phi/ops/compat/graph_send_ue_recv_sig.cc | 6 +- .../unittests/test_graph_send_recv_op.py | 24 +++---- .../unittests/test_graph_send_ue_recv_op.py | 16 ++--- .../geometric/message_passing/send_recv.py | 18 ++--- .../incubate/operators/graph_send_recv.py | 6 +- 26 files changed, 231 insertions(+), 231 deletions(-) diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index e9ba861c3b88b..b954ecab704b4 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -64,9 +64,9 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddOutput("Out", "Output tensor of graph_send_recv op."); AddOutput("Dst_count", - "Count tensor of Dst_index, mainly for MEAN pool_type.") + "Count tensor of Dst_index, mainly for MEAN reduce_op.") .AsIntermediate(); - AddAttr("pool_type", + AddAttr("reduce_op", "(string, default 'SUM')" "Define different pool types to receive the result " "tensors of Dst_index.") @@ -81,7 +81,7 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Graph Learning Send_Recv combine operator. 
-$Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ +$Out = Recv(Send(X, Src_index), Dst_index, reduce_op)$ This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. @@ -105,12 +105,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("Dst_index", this->Input("Dst_index")); op->SetInput("X", this->Input("X")); - if (PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") { + if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") { op->SetInput("Dst_count", this->Output("Dst_count")); } - if (PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || - PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") { + if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" || + PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") { op->SetInput("Out", this->Output("Out")); } diff --git a/paddle/fluid/operators/graph_send_ue_recv_op.cc b/paddle/fluid/operators/graph_send_ue_recv_op.cc index 696b2656a7052..af16609df3ebd 100644 --- a/paddle/fluid/operators/graph_send_ue_recv_op.cc +++ b/paddle/fluid/operators/graph_send_ue_recv_op.cc @@ -68,14 +68,14 @@ class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddOutput("Out", "Output tensor of graph_send_ue_recv op."); AddOutput("Dst_count", - "Count tensor of Dst_index, mainly for MEAN pool_type.") + "Count tensor of Dst_index, mainly for MEAN reduce_op.") .AsIntermediate(); - AddAttr("compute_type", + AddAttr("message_op", "(string, default 'ADD')" "Define differenct computation types between X and E.") .SetDefault("ADD") .InEnum({"ADD", "MUL"}); - AddAttr("pool_type", + AddAttr("reduce_op", "(string, default 'SUM')" "Define different pool types to receive the result " "tensors of Dst_index.") @@ -90,13 +90,13 @@ class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Graph Learning Send_UE_Recv combine operator. -$Out = Recv(Compute(Send(X, Src_index), Y, compute_type), Dst_index, pool_type)$ +$Out = Recv(Compute(Send(X, Src_index), Y, message_op), Dst_index, reduce_op)$ This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `X` as the input tensor, we first use `src_index` to gather corresponding data. -Then the gather data should compute with `Y` in different compute_types, like add, sub, mul, and div, +Then the gather data should compute with `Y` in different message_ops, like add, sub, mul, and div, and get the computation result. Then, use `dst_index` to update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. 
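Editor's note (not part of the patch): the GradOpMaker hunks above and below decide which forward outputs the backward op consumes, depending on the reduce op. Summarized as a sketch (the dict is an editor's table, not Paddle code):

    # Extra forward tensors wired into the grad op, per reduce_op:
    GRAD_EXTRA_INPUTS = {
        "SUM":  [],             # plain scatter-add; out_grad alone suffices
        "MEAN": ["Dst_count"],  # backward rescales by each destination's in-degree
        "MIN":  ["Out"],        # backward must re-identify the winning messages
        "MAX":  ["Out"],
    }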
@@ -117,12 +117,12 @@ class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("Src_index", this->Input("Src_index")); op->SetInput("Dst_index", this->Input("Dst_index")); - if (PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") { + if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") { op->SetInput("Dst_count", this->Output("Dst_count")); } - if (PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || - PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") { + if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" || + PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") { op->SetInput("Out", this->Output("Out")); } diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_api.yaml index c7c2f6f0152f9..4f2ea4a6b6655 100755 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_api.yaml @@ -1060,7 +1060,7 @@ func : generate_proposals_v2 - api : graph_send_recv - args : (Tensor x, Tensor src_index, Tensor dst_index, str pool_type = "SUM", IntArray out_size = {0}) + args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) output : Tensor(out), Tensor(dst_count) infer_meta : func : GraphSendRecvInferMeta @@ -1071,7 +1071,7 @@ backward : graph_send_recv_grad - api : graph_send_ue_recv - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str compute_type, str pool_type, IntArray out_size) + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) output : Tensor(out), Tensor(dst_count) infer_meta : func : GraphSendUERecvInferMeta diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 31ddba838f512..ba9f306faecc4 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -941,8 +941,8 @@ func : gelu_grad - backward_api : graph_send_recv_grad - forward : graph_send_recv (Tensor x, Tensor src_index, Tensor dst_index, str pool_type = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) - args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str pool_type = "SUM") + forward : graph_send_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM") output : Tensor(x_grad) infer_meta : func : GeneralUnaryGradInferMeta @@ -953,8 +953,8 @@ optional: out, dst_count - backward_api : graph_send_ue_recv_grad - forward : graph_send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str compute_type, str pool_type, IntArray out_size) -> Tensor(out), Tensor(dst_count) - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str compute_type, str pool_type) + forward : graph_send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count) + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op) output : Tensor(x_grad), Tensor(y_grad) infer_meta : func : GeneralBinaryGradInferMeta diff --git a/paddle/phi/infermeta/multiary.cc 
b/paddle/phi/infermeta/multiary.cc index 53076d1a5d127..7ccd52bb6ff39 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2603,8 +2603,8 @@ void GraphSendUERecvInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& src_index, const MetaTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, const IntArray& out_size, MetaTensor* out, MetaTensor* dst_count) { @@ -2658,7 +2658,7 @@ void GraphSendUERecvInferMeta(const MetaTensor& x, y_dims[0])); auto x_dims = x.dims(); - if (pool_type == "MEAN") { + if (reduce_op == "MEAN") { dst_count->set_dims({-1}); dst_count->set_dtype(DataType::INT32); } diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 66d8ad84a4378..660121b844d10 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -470,8 +470,8 @@ void GraphSendUERecvInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& src_index, const MetaTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, const IntArray& out_size, MetaTensor* out, MetaTensor* dst_count); diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index a919a955a541a..342c9e4602309 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -411,7 +411,7 @@ void InstanceNormInferMeta(const MetaTensor& x, void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, - const std::string& pool_type, + const std::string& reduce_op, const IntArray& out_size, MetaTensor* out, MetaTensor* dst_count) { @@ -460,7 +460,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(dims_)); out->set_dtype(x.dtype()); - if (pool_type == "MEAN") { + if (reduce_op == "MEAN") { dst_count->set_dims({-1}); dst_count->set_dtype(DataType::INT32); } diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 466bd3df5de2d..5314b8f45affe 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -75,7 +75,7 @@ void InstanceNormInferMeta(const MetaTensor& x, void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, - const std::string& pool_type, + const std::string& reduce_op, const IntArray& out_size, MetaTensor* out, MetaTensor* dst_count); diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc index ad04bd258e141..d4131a1ffb5e3 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc @@ -29,10 +29,10 @@ void GraphSendRecvCpuGradLoop(const int& index_size, const DenseTensor& src, const DenseTensor& input, DenseTensor* dst, - const std::string& pool_type, + const std::string& reduce_op, const int* dst_count = nullptr, const DenseTensor* output = nullptr) { - if (pool_type == "SUM") { + if (reduce_op == "SUM") { Functor functor; for (int i = 0; i < index_size; ++i) { const IndexT& src_idx = s_index[i]; @@ -40,7 +40,7 @@ void GraphSendRecvCpuGradLoop(const int& index_size, ElementwiseInnerOperation( src, dst, src_idx, dst_idx, false, functor); } - } else if (pool_type == "MEAN") { + } else if (reduce_op == "MEAN") { for (int i = 0; i < index_size; ++i) { const IndexT& src_idx = s_index[i]; 
const IndexT& dst_idx = d_index[i]; @@ -50,7 +50,7 @@ void GraphSendRecvCpuGradLoop(const int& index_size, auto eigen_dst = phi::EigenVector::Flatten(dst_slice); eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); } - } else if (pool_type == "MIN" || pool_type == "MAX") { + } else if (reduce_op == "MIN" || reduce_op == "MAX") { for (int i = 0; i < index_size; ++i) { const IndexT& forward_src_idx = d_index[i]; const IndexT& forward_dst_idx = s_index[i]; @@ -75,7 +75,7 @@ void GraphSendRecvGradOpKernelLaunchHelper( const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& pool_type, + const std::string& reduce_op, DenseTensor* x_grad, const DenseTensor* dst_count = nullptr, const DenseTensor* out = nullptr) { @@ -94,15 +94,15 @@ void GraphSendRecvGradOpKernelLaunchHelper( const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); - if (pool_type == "SUM") { + if (reduce_op == "SUM") { GraphSendRecvCpuGradLoop>( - index_size, d_index, s_index, out_grad, x, x_grad, pool_type); - } else if (pool_type == "MEAN") { + index_size, d_index, s_index, out_grad, x, x_grad, reduce_op); + } else if (reduce_op == "MEAN") { const int* s_count = dst_count->data(); // Functor not used here. GraphSendRecvCpuGradLoop>( - index_size, d_index, s_index, out_grad, x, x_grad, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { + index_size, d_index, s_index, out_grad, x, x_grad, reduce_op, s_count); + } else if (reduce_op == "MIN" || reduce_op == "MAX") { // Functor not used here. GraphSendRecvCpuGradLoop>(index_size, d_index, @@ -110,7 +110,7 @@ void GraphSendRecvGradOpKernelLaunchHelper( out_grad, x, x_grad, - pool_type, + reduce_op, nullptr, out); } @@ -124,7 +124,7 @@ void GraphSendRecvGradKernel(const Context& ctx, const paddle::optional& out, const paddle::optional& dst_count, const DenseTensor& out_grad, - const std::string& pool_type, + const std::string& reduce_op, DenseTensor* x_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { @@ -134,7 +134,7 @@ void GraphSendRecvGradKernel(const Context& ctx, x, src_index, dst_index, - pool_type, + reduce_op, x_grad, dst_count.get_ptr(), out.get_ptr()); @@ -145,7 +145,7 @@ void GraphSendRecvGradKernel(const Context& ctx, x, src_index, dst_index, - pool_type, + reduce_op, x_grad, dst_count.get_ptr(), out.get_ptr()); diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc index d4b9c8c60e3f8..7985a65a20053 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc @@ -32,17 +32,17 @@ void GraphSendRecvCpuLoop(const int& input_size, const IndexT* d_index, const DenseTensor& src, DenseTensor* dst, - const std::string& pool_type, + const std::string& reduce_op, int* dst_count = nullptr) { Functor functor; - if (pool_type == "SUM") { + if (reduce_op == "SUM") { for (int i = 0; i < index_size; ++i) { const IndexT& src_idx = s_index[i]; const IndexT& dst_idx = d_index[i]; ElementwiseInnerOperation( src, dst, src_idx, dst_idx, false, functor); } - } else if (pool_type == "MEAN") { + } else if (reduce_op == "MEAN") { for (int i = 0; i < index_size; ++i) { const IndexT& src_idx = s_index[i]; const IndexT& dst_idx = d_index[i]; @@ -59,7 +59,7 @@ void GraphSendRecvCpuLoop(const int& input_size, auto eigen_dst = phi::EigenVector::Flatten(dst_slice); eigen_dst = eigen_dst / static_cast(*(dst_count + i)); } - } else if (pool_type == 
"MIN" || pool_type == "MAX") { + } else if (reduce_op == "MIN" || reduce_op == "MAX") { std::set existed_dst; for (int i = 0; i < index_size; ++i) { const IndexT& src_idx = s_index[i]; @@ -82,7 +82,7 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& pool_type, + const std::string& reduce_op, int64_t out_size, DenseTensor* out, DenseTensor* dst_count = nullptr) { @@ -117,16 +117,16 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); - if (pool_type == "SUM") { + if (reduce_op == "SUM") { GraphSendRecvCpuLoop>( - src_dims[0], index_size, s_index, d_index, x, out, pool_type); - } else if (pool_type == "MIN") { + src_dims[0], index_size, s_index, d_index, x, out, reduce_op); + } else if (reduce_op == "MIN") { GraphSendRecvCpuLoop>( - src_dims[0], index_size, s_index, d_index, x, out, pool_type); - } else if (pool_type == "MAX") { + src_dims[0], index_size, s_index, d_index, x, out, reduce_op); + } else if (reduce_op == "MAX") { GraphSendRecvCpuLoop>( - src_dims[0], index_size, s_index, d_index, x, out, pool_type); - } else if (pool_type == "MEAN") { + src_dims[0], index_size, s_index, d_index, x, out, reduce_op); + } else if (reduce_op == "MEAN") { int64_t input_size = out_size <= 0 ? src_dims[0] : out_size; dst_count->Resize({input_size}); ctx.template Alloc(dst_count); @@ -138,7 +138,7 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, d_index, x, out, - pool_type, + reduce_op, p_dst_count); } } @@ -148,7 +148,7 @@ void GraphSendRecvKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& pool_type, + const std::string& reduce_op, const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count) { @@ -159,7 +159,7 @@ void GraphSendRecvKernel(const Context& ctx, x, src_index, dst_index, - pool_type, + reduce_op, out_size_data[0], out, dst_count); @@ -168,7 +168,7 @@ void GraphSendRecvKernel(const Context& ctx, x, src_index, dst_index, - pool_type, + reduce_op, out_size_data[0], out, dst_count); diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc index c3ae8563370f8..95fdc6ff0a9cc 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc @@ -39,8 +39,8 @@ void CalculateXGrad(const Context& ctx, const phi::DDim& e_dims, const IndexT* s_index, const IndexT* d_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, int64_t index_size, T* x_grad, const DenseTensor& out_grad_tensor, @@ -50,8 +50,8 @@ void CalculateXGrad(const Context& ctx, std::vector reduce_idx; bool reduce = ReduceGrad(out_grad_dims, x_dims, reduce_idx); - if (pool_type == "SUM") { - if (compute_type == "ADD") { + if (reduce_op == "SUM") { + if (message_op == "ADD") { GraphSendRecvSumFunctor sum_functor; if (!reduce) { for (int64_t i = 0; i < index_size; i++) { @@ -78,7 +78,7 @@ void CalculateXGrad(const Context& ctx, true); memcpy(x_grad, x_grad_out.data(), x_grad_out.numel() * sizeof(T)); } - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { const auto& bcast = phi::CalcBCastInfo(out_grad_dims, e_dims); if (!reduce) { #ifdef PADDLE_WITH_MKLML @@ -137,9 +137,9 @@ void 
CalculateXGrad(const Context& ctx, memcpy(x_grad, x_grad_out.data(), x_grad_out.numel() * sizeof(T)); } } - } else if (pool_type == "MEAN") { + } else if (reduce_op == "MEAN") { const int* s_count = dst_count->data(); - if (compute_type == "ADD") { + if (message_op == "ADD") { if (!reduce) { for (int64_t i = 0; i < index_size; i++) { IndexT src = s_index[i]; @@ -171,7 +171,7 @@ void CalculateXGrad(const Context& ctx, true); memcpy(x_grad, x_grad_out.data(), x_grad_out.numel() * sizeof(T)); } - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { const auto& bcast = phi::CalcBCastInfo(out_grad_dims, e_dims); if (!reduce) { #ifdef PADDLE_WITH_MKLML @@ -237,13 +237,13 @@ void CalculateEGrad(const T* out_grad_data, const phi::DDim& e_dims, const IndexT* s_index, const IndexT* d_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, int64_t index_size, T* e_grad, const DenseTensor* dst_count = nullptr) { const auto& bcast = phi::CalcBCastInfo(x_dims, e_dims); - if (pool_type == "SUM") { + if (reduce_op == "SUM") { #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif @@ -256,12 +256,12 @@ void CalculateEGrad(const T* out_grad_data, for (int64_t j = 0; j < bcast.out_len; j++) { int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; - if (compute_type == "ADD") { + if (message_op == "ADD") { #ifdef PADDLE_WITH_MKLML #pragma omp atomic #endif e_grad_off[e_add] += out_grad_off[j]; - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { #ifdef PADDLE_WITH_MKLML #pragma omp atomic #endif @@ -269,7 +269,7 @@ void CalculateEGrad(const T* out_grad_data, } } } - } else if (pool_type == "MEAN") { + } else if (reduce_op == "MEAN") { const int* s_count = dst_count->data(); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for @@ -283,12 +283,12 @@ void CalculateEGrad(const T* out_grad_data, for (int64_t j = 0; j < bcast.out_len; j++) { int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; int64_t e_add = bcast.use_bcast ? bcast.r_offset[j] : j; - if (compute_type == "ADD") { + if (message_op == "ADD") { #ifdef PADDLE_WITH_MKLML #pragma omp atomic #endif e_grad_off[e_add] += (out_grad_off[j] / s_count[dst]); - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { #ifdef PADDLE_WITH_MKLML #pragma omp atomic #endif @@ -307,8 +307,8 @@ void CalculateXEGradForMinMax(const T* out_grad, const phi::DDim& e_dims, const IndexT* s_index, const IndexT* d_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, int64_t index_size, T* x_grad, T* e_grad, @@ -330,14 +330,14 @@ void CalculateXEGradForMinMax(const T* out_grad, for (int64_t j = 0; j < bcast.out_len; j++) { int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; int64_t e_add = bcast.use_bcast ? 
bcast.r_offset[j] : j; - if (compute_type == "ADD") { + if (message_op == "ADD") { T val = x_off[x_add] + e_off[e_add]; #ifdef PADDLE_WITH_MKLML #pragma omp critical #endif x_grad_off[x_add] += (out_grad_off[j] * (val == out_off[j])); e_grad_off[e_add] += (out_grad_off[j] * (val == out_off[j])); - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { T val = x_off[x_add] * e_off[e_add]; #ifdef PADDLE_WITH_MKLML #pragma omp critical @@ -359,8 +359,8 @@ void GraphSendUERecvGradOpKernelLaunchHelper( const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, DenseTensor* x_grad, DenseTensor* y_grad, const DenseTensor* dst_count = nullptr, @@ -395,7 +395,7 @@ void GraphSendUERecvGradOpKernelLaunchHelper( const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); - if (pool_type == "SUM" || pool_type == "MEAN") { + if (reduce_op == "SUM" || reduce_op == "MEAN") { CalculateXGrad(ctx, out_grad_data, x_data, @@ -405,8 +405,8 @@ void GraphSendUERecvGradOpKernelLaunchHelper( y_dims, d_index, s_index, - compute_type, - pool_type, + message_op, + reduce_op, index_size, x_grad_data, out_grad, @@ -420,12 +420,12 @@ void GraphSendUERecvGradOpKernelLaunchHelper( y_dims, s_index, d_index, - compute_type, - pool_type, + message_op, + reduce_op, index_size, y_grad_data, dst_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { + } else if (reduce_op == "MIN" || reduce_op == "MAX") { CalculateXEGradForMinMax(out_grad_data, x_data, y_data, @@ -433,8 +433,8 @@ void GraphSendUERecvGradOpKernelLaunchHelper( y_dims, d_index, s_index, - compute_type, - pool_type, + message_op, + reduce_op, index_size, x_grad_data, y_grad_data, @@ -451,8 +451,8 @@ void GraphSendUERecvGradKernel(const Context& ctx, const paddle::optional& out, const paddle::optional& dst_count, const DenseTensor& out_grad, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, DenseTensor* x_grad, DenseTensor* y_grad) { auto index_type = src_index.dtype(); @@ -464,8 +464,8 @@ void GraphSendUERecvGradKernel(const Context& ctx, y, src_index, dst_index, - compute_type, - pool_type, + message_op, + reduce_op, x_grad, y_grad, dst_count.get_ptr(), @@ -478,8 +478,8 @@ void GraphSendUERecvGradKernel(const Context& ctx, y, src_index, dst_index, - compute_type, - pool_type, + message_op, + reduce_op, x_grad, y_grad, dst_count.get_ptr(), diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc index 5c3760657be86..74fca002294db 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc @@ -110,8 +110,8 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, int64_t out_size, DenseTensor* out, DenseTensor* dst_count = nullptr) { @@ -140,8 +140,8 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, const T* y_data = y.data(); const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); - if (pool_type == "SUM" || pool_type == "MEAN") { - if (compute_type == "ADD") { + if (reduce_op == "SUM" || reduce_op == 
"MEAN") { + if (message_op == "ADD") { GraphAddFunctor add_functor; GraphSendUERecvSumCpuKernel>(bcast_info, x_data, @@ -151,7 +151,7 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, out_data, index_size, add_functor); - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { GraphMulFunctor mul_functor; GraphSendUERecvSumCpuKernel>(bcast_info, x_data, @@ -162,7 +162,7 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, index_size, mul_functor); } - if (pool_type == "MEAN") { + if (reduce_op == "MEAN") { int64_t input_size = out_size <= 0 ? x.dims()[0] : out_size; dst_count->Resize({input_size}); int* dst_count_data = ctx.template Alloc(dst_count); @@ -178,9 +178,9 @@ void GraphSendUERecvOpKernelLaunchHelper(const Context& ctx, eigen_out = eigen_out / static_cast(dst_count_data[i]); } } - } else if (pool_type == "MIN") { + } else if (reduce_op == "MIN") { GraphMinFunctor min_functor; - if (compute_type == "ADD") { + if (message_op == "ADD") { GraphAddFunctor add_functor; GraphSendUERecvMinMaxCpuKernel mul_functor; GraphSendUERecvMinMaxCpuKernel max_functor; - if (compute_type == "ADD") { + if (message_op == "ADD") { GraphAddFunctor add_functor; GraphSendUERecvMinMaxCpuKernel mul_functor; GraphSendUERecvMinMaxCpuKernel functor; GraphSendRecvCUDAKernel> <<>>( p_src, d_index, s_index, p_output, index_size, slice_size, functor); - } else if (pool_type == "MEAN") { + } else if (reduce_op == "MEAN") { const int32_t* s_count = dst_count->data(); ManipulateMeanGradCUDAKernel<<>>( p_src, d_index, s_index, p_output, index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { + } else if (reduce_op == "MAX" || reduce_op == "MIN") { const T* ptr_input = x.data(); const T* ptr_output = out->data(); ManipulateMinMaxGradCUDAKernel @@ -105,7 +105,7 @@ void GraphSendRecvGradKernel(const Context& ctx, const paddle::optional& out, const paddle::optional& dst_count, const DenseTensor& out_grad, - const std::string& pool_type, + const std::string& reduce_op, DenseTensor* x_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { @@ -115,7 +115,7 @@ void GraphSendRecvGradKernel(const Context& ctx, x, src_index, dst_index, - pool_type, + reduce_op, x_grad, dst_count.get_ptr(), out.get_ptr()); @@ -126,7 +126,7 @@ void GraphSendRecvGradKernel(const Context& ctx, x, src_index, dst_index, - pool_type, + reduce_op, x_grad, dst_count.get_ptr(), out.get_ptr()); diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu index e696960f800d0..055d4888e3f56 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu @@ -32,7 +32,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& pool_type, + const std::string& reduce_op, int64_t out_size, DenseTensor* out, DenseTensor* dst_count = nullptr) { @@ -59,19 +59,19 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, ctx.template Alloc(out); T* p_output = out->data(); const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { + if (reduce_op == "SUM" || reduce_op == "MEAN") { #ifdef PADDLE_WITH_HIP hipMemset(p_output, 0, memset_bytes); #else cudaMemset(p_output, 0, memset_bytes); #endif - } else if (pool_type == "MAX") { + } else if (reduce_op == "MAX") { thrust::device_ptr 
p_output_ptr(p_output); thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, std::numeric_limits::lowest()); - } else if (pool_type == "MIN") { + } else if (reduce_op == "MIN") { thrust::device_ptr p_output_ptr(p_output); thrust::fill(thrust::device, p_output_ptr, @@ -99,12 +99,12 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, int64_t grid_tmp = (n + block - 1) / block; int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int64_t input_size = out_size <= 0 ? src_dims[0] : out_size; - if (pool_type == "SUM") { + if (reduce_op == "SUM") { GraphSendRecvSumCUDAFunctor functor; GraphSendRecvCUDAKernel> <<>>( p_src, s_index, d_index, p_output, index_size, slice_size, functor); - } else if (pool_type == "MAX") { + } else if (reduce_op == "MAX") { GraphSendRecvMaxCUDAFunctor functor; GraphSendRecvCUDAKernel> <<>>( @@ -115,7 +115,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; InputResetMaxCUDAKernel<<>>( p_output, input_size, slice_size); - } else if (pool_type == "MIN") { + } else if (reduce_op == "MIN") { GraphSendRecvMinCUDAFunctor functor; GraphSendRecvCUDAKernel> <<>>( @@ -126,7 +126,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; InputResetMinCUDAKernel<<>>( p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { + } else if (reduce_op == "MEAN") { GraphSendRecvSumCUDAFunctor functor; GraphSendRecvCUDAKernel> <<>>( @@ -158,7 +158,7 @@ void GraphSendRecvKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& pool_type, + const std::string& reduce_op, const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count) { @@ -169,7 +169,7 @@ void GraphSendRecvKernel(const Context& ctx, x, src_index, dst_index, - pool_type, + reduce_op, out_size_data[0], out, dst_count); @@ -178,7 +178,7 @@ void GraphSendRecvKernel(const Context& ctx, x, src_index, dst_index, - pool_type, + reduce_op, out_size_data[0], out, dst_count); diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu index 7d89a1bc7d82e..cb3d5591a7be6 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu @@ -35,8 +35,8 @@ void CalculateXEGradForMinMax(const Context& ctx, const phi::DDim& e_dims, const IndexT* s_index, const IndexT* d_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, int64_t index_size, T* x_grad, T* e_grad, @@ -56,7 +56,7 @@ void CalculateXEGradForMinMax(const Context& ctx, const dim3 grid(nbx, nby); const dim3 block(ntx, nty); - if (compute_type == "ADD") { + if (message_op == "ADD") { ManipulateMinMaxGradCUDAKernelForAdd <<>>( x_data, @@ -74,7 +74,7 @@ void CalculateXEGradForMinMax(const Context& ctx, bcast_info.r_len, out_len, bcast_info.use_bcast); - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { ManipulateMinMaxGradCUDAKernelForMul <<>>( x_data, @@ -105,8 +105,8 @@ void CalculateXGrad(const Context& ctx, const phi::DDim& e_dims, const IndexT* s_index, const IndexT* d_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, int64_t index_size, int64_t slice_size, T* x_grad, @@ 
-124,8 +124,8 @@ void CalculateXGrad(const Context& ctx, int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; std::vector reduce_idx; bool reduce = ReduceGrad(out_grad_dims, x_dims, reduce_idx); - if (pool_type == "SUM") { - if (compute_type == "ADD") { + if (reduce_op == "SUM") { + if (message_op == "ADD") { GraphSendRecvSumCUDAFunctor functor; if (!reduce) { GraphSendRecvCUDAKernel l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { @@ -251,9 +251,9 @@ void CalculateXGrad(const Context& ctx, #endif } } - } else if (pool_type == "MEAN") { + } else if (reduce_op == "MEAN") { const int* s_count = dst_count->data(); - if (compute_type == "ADD") { + if (message_op == "ADD") { if (!reduce) { ManipulateMeanGradCUDAKernel <<>>(out_grad, @@ -296,7 +296,7 @@ void CalculateXGrad(const Context& ctx, cudaMemcpyDeviceToDevice); #endif } - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, e_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { @@ -378,8 +378,8 @@ void CalculateEGrad(const Context& ctx, const phi::DDim& e_dims, const IndexT* s_index, const IndexT* d_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, int64_t index_size, T* e_grad, const DenseTensor* dst_count = nullptr) { @@ -395,8 +395,8 @@ void CalculateEGrad(const Context& ctx, const int nby = (index_size + nty - 1) / nty; const dim3 grid(nbx, nby); const dim3 block(ntx, nty); - if (pool_type == "SUM") { - if (compute_type == "ADD") { + if (reduce_op == "SUM") { + if (message_op == "ADD") { ManipulateSumGradCUDAKernelForAddE <<>>( out_grad, @@ -407,7 +407,7 @@ void CalculateEGrad(const Context& ctx, bcast_info.r_len, out_len, bcast_info.use_bcast); - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { ManipulateSumGradCUDAKernelForMulE <<>>( x_data, @@ -423,9 +423,9 @@ void CalculateEGrad(const Context& ctx, out_len, bcast_info.use_bcast); } - } else if (pool_type == "MEAN") { + } else if (reduce_op == "MEAN") { const int* s_count = dst_count->data(); - if (compute_type == "ADD") { + if (message_op == "ADD") { ManipulateMeanGradCUDAKernelForAddE <<>>( out_grad, @@ -437,7 +437,7 @@ void CalculateEGrad(const Context& ctx, bcast_info.r_len, out_len, bcast_info.use_bcast); - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { ManipulateMeanGradCUDAKernelForMulE <<>>( x_data, @@ -465,8 +465,8 @@ void GraphSendUERecvGradOpCUDAKernelLaunchHelper( const DenseTensor& e, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, DenseTensor* x_grad, DenseTensor* e_grad, const DenseTensor* dst_count = nullptr, @@ -506,7 +506,7 @@ void GraphSendUERecvGradOpCUDAKernelLaunchHelper( const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); - if (pool_type == "SUM" || pool_type == "MEAN") { + if (reduce_op == "SUM" || reduce_op == "MEAN") { CalculateXGrad(ctx, out_grad_data, x_data, @@ -516,8 +516,8 @@ void GraphSendUERecvGradOpCUDAKernelLaunchHelper( e_dims, s_index, d_index, - compute_type, - pool_type, + message_op, + reduce_op, index_size, slice_size, x_grad_data, @@ -532,12 +532,12 @@ void GraphSendUERecvGradOpCUDAKernelLaunchHelper( e_dims, s_index, d_index, - compute_type, - pool_type, + message_op, + reduce_op, index_size, e_grad_data, 
dst_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { + } else if (reduce_op == "MIN" || reduce_op == "MAX") { CalculateXEGradForMinMax(ctx, out_grad_data, x_data, @@ -546,8 +546,8 @@ void GraphSendUERecvGradOpCUDAKernelLaunchHelper( e_dims, s_index, d_index, - compute_type, - pool_type, + message_op, + reduce_op, index_size, x_grad_data, e_grad_data, @@ -564,8 +564,8 @@ void GraphSendUERecvGradKernel(const Context& ctx, const paddle::optional& out, const paddle::optional& dst_count, const DenseTensor& out_grad, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, DenseTensor* x_grad, DenseTensor* y_grad) { auto index_type = src_index.dtype(); @@ -577,8 +577,8 @@ void GraphSendUERecvGradKernel(const Context& ctx, y, src_index, dst_index, - compute_type, - pool_type, + message_op, + reduce_op, x_grad, y_grad, dst_count.get_ptr(), @@ -591,8 +591,8 @@ void GraphSendUERecvGradKernel(const Context& ctx, y, src_index, dst_index, - compute_type, - pool_type, + message_op, + reduce_op, x_grad, y_grad, dst_count.get_ptr(), diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu index 28e304266dabd..f339387f0bbfc 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu @@ -35,8 +35,8 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, const DenseTensor& e, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, int64_t out_size, DenseTensor* out, DenseTensor* dst_count = nullptr) { @@ -57,20 +57,20 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, ctx.template Alloc(out); T* out_data = out->data(); const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { + if (reduce_op == "SUM" || reduce_op == "MEAN") { #ifdef PADDLE_WITH_HIP hipMemset(out_data, 0, memset_bytes); #else cudaMemset(out_data, 0, memset_bytes); #endif - } else if (pool_type == "MAX") { + } else if (reduce_op == "MAX") { thrust::device_ptr out_data_ptr(out_data); thrust::fill(thrust::device, out_data_ptr, out_data_ptr + memset_size, std::numeric_limits::lowest()); - } else if (pool_type == "MIN") { + } else if (reduce_op == "MIN") { thrust::device_ptr out_data_ptr(out_data); thrust::fill(thrust::device, out_data_ptr, @@ -104,9 +104,9 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, #else int block_ = 1024; #endif - if (pool_type == "SUM" || pool_type == "MEAN") { + if (reduce_op == "SUM" || reduce_op == "MEAN") { GraphSendUERecvSumCUDAFunctor sum_functor; - if (compute_type == "ADD") { + if (message_op == "ADD") { funcs::AddFunctor add_funtor; GraphSendUERecvCUDAKernel mul_functor; GraphSendUERecvCUDAKernelResize({input_size}); ctx.template Alloc(dst_count); @@ -171,9 +171,9 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& ctx, ManipulateMeanCUDAKernel<<>>( out_data, dst_count_data, input_size, out_len); } - } else if (pool_type == "MAX") { + } else if (reduce_op == "MAX") { GraphSendUERecvMaxCUDAFunctor max_functor; - if (compute_type == "ADD") { + if (message_op == "ADD") { funcs::AddFunctor add_funtor; GraphSendUERecvCUDAKernel mul_functor; GraphSendUERecvCUDAKernel <<>>(out_data, input_size, out_len); - } else if (pool_type == "MIN") { + } else if (reduce_op == "MIN") 
{ GraphSendUERecvMinCUDAFunctor min_functor; - if (compute_type == "ADD") { + if (message_op == "ADD") { funcs::AddFunctor add_funtor; GraphSendUERecvCUDAKernel mul_functor; GraphSendUERecvCUDAKernel& out, const paddle::optional& dst_count, const DenseTensor& out_grad, - const std::string& pool_type, + const std::string& reduce_op, DenseTensor* x_grad); } // namespace phi diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/graph_send_recv_kernel.h index cd625c92b93ea..023e86064ff51 100644 --- a/paddle/phi/kernels/graph_send_recv_kernel.h +++ b/paddle/phi/kernels/graph_send_recv_kernel.h @@ -26,7 +26,7 @@ void GraphSendRecvKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& pool_type, + const std::string& reduce_op, const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count); diff --git a/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h index f5c7ce9a8937e..74050d126259d 100644 --- a/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h +++ b/paddle/phi/kernels/graph_send_ue_recv_grad_kernel.h @@ -29,8 +29,8 @@ void GraphSendUERecvGradKernel(const Context& ctx, const paddle::optional& out, const paddle::optional& dst_count, const DenseTensor& out_grad, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, DenseTensor* x_grad, DenseTensor* y_grad); } // namespace phi diff --git a/paddle/phi/kernels/graph_send_ue_recv_kernel.h b/paddle/phi/kernels/graph_send_ue_recv_kernel.h index efb93ab47c93c..a308a78800f3a 100644 --- a/paddle/phi/kernels/graph_send_ue_recv_kernel.h +++ b/paddle/phi/kernels/graph_send_ue_recv_kernel.h @@ -26,8 +26,8 @@ void GraphSendUERecvKernel(const Context& ctx, const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, - const std::string& pool_type, + const std::string& message_op, + const std::string& reduce_op, const IntArray& out_size, DenseTensor* out, DenseTensor* dst_count); diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc index c8c15619d5d39..0ca1a3fae0230 100644 --- a/paddle/phi/ops/compat/graph_send_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc @@ -21,12 +21,12 @@ KernelSignature GraphSendRecvOpArgumentMapping( if (ctx.HasInput("Out_size")) { return KernelSignature("graph_send_recv", {"X", "Src_index", "Dst_index"}, - {"pool_type", "Out_size"}, + {"reduce_op", "Out_size"}, {"Out", "Dst_count"}); } else { return KernelSignature("graph_send_recv", {"X", "Src_index", "Dst_index"}, - {"pool_type", "out_size"}, + {"reduce_op", "out_size"}, {"Out", "Dst_count"}); } } @@ -36,7 +36,7 @@ KernelSignature GraphSendRecvGradOpArgumentMapping( return KernelSignature( "graph_send_recv_grad", {"X", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, - {"pool_type"}, + {"reduce_op"}, {"X@GRAD"}); } diff --git a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc index a4cd6f4a150b1..0b2ddcc07e1bb 100644 --- a/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_ue_recv_sig.cc @@ -21,12 +21,12 @@ KernelSignature GraphSendUERecvOpArgumentMapping( if (ctx.HasInput("Out_size")) { return KernelSignature("graph_send_ue_recv", {"X", "Y", "Src_index", "Dst_index"}, - {"compute_type", "pool_type", "Out_size"}, + {"message_op", 
"reduce_op", "Out_size"}, {"Out", "Dst_count"}); } else { return KernelSignature("graph_send_ue_recv", {"X", "Y", "Src_index", "Dst_index"}, - {"compute_type", "pool_type", "out_size"}, + {"message_op", "reduce_op", "out_size"}, {"Out", "Dst_count"}); } } @@ -36,7 +36,7 @@ KernelSignature GraphSendUERecvGradOpArgumentMapping( return KernelSignature( "graph_send_ue_recv_grad", {"X", "Y", "Src_index", "Dst_index", "Out", "Dst_count", "Out@GRAD"}, - {"compute_type", "pool_type"}, + {"message_op", "reduce_op"}, {"X@GRAD", "Y@GRAD"}); } diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py index 1b7d8213e75ac..81fcf06167e13 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py @@ -46,7 +46,7 @@ def setUp(self): self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} - self.attrs = {'pool_type': 'MAX'} + self.attrs = {'reduce_op': 'MAX'} out, self.gradient = compute_graph_send_recv_for_min_max( self.inputs, self.attrs) @@ -76,7 +76,7 @@ def setUp(self): self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} - self.attrs = {'pool_type': 'MIN'} + self.attrs = {'reduce_op': 'MIN'} out, self.gradient = compute_graph_send_recv_for_min_max( self.inputs, self.attrs) @@ -107,7 +107,7 @@ def setUp(self): self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} - self.attrs = {'pool_type': 'SUM'} + self.attrs = {'reduce_op': 'SUM'} out, _ = compute_graph_send_recv_for_sum_mean(self.inputs, self.attrs) @@ -134,7 +134,7 @@ def setUp(self): self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} - self.attrs = {'pool_type': 'MEAN'} + self.attrs = {'reduce_op': 'MEAN'} out, dst_count = compute_graph_send_recv_for_sum_mean( self.inputs, self.attrs) @@ -153,15 +153,15 @@ def compute_graph_send_recv_for_sum_mean(inputs, attributes): src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] - pool_type = attributes['pool_type'] + reduce_op = attributes['reduce_op'] gather_x = x[src_index] target_shape = list(x.shape) results = np.zeros(target_shape, dtype=x.dtype) - if pool_type == 'SUM': + if reduce_op == 'SUM': for index, s_id in enumerate(dst_index): results[s_id, :] += gather_x[index, :] - elif pool_type == 'MEAN': + elif reduce_op == 'MEAN': count = np.zeros(target_shape[0], dtype=np.int32) for index, s_id in enumerate(dst_index): results[s_id, :] += gather_x[index, :] @@ -169,7 +169,7 @@ def compute_graph_send_recv_for_sum_mean(inputs, attributes): results = results / count.reshape([-1, 1]) results[np.isnan(results)] = 0 else: - raise ValueError("Invalid pool_type, only SUM, MEAN supported!") + raise ValueError("Invalid reduce_op, only SUM, MEAN supported!") count = np.zeros(target_shape[0], dtype=np.int32) for index, s_id in enumerate(dst_index): @@ -183,7 +183,7 @@ def compute_graph_send_recv_for_min_max(inputs, attributes): src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] - pool_type = attributes['pool_type'] + reduce_op = attributes['reduce_op'] gather_x = x[src_index] target_shape = list(x.shape) @@ -191,7 +191,7 @@ def compute_graph_send_recv_for_min_max(inputs, attributes): gradient = np.zeros_like(x) # Calculate forward output - if pool_type == "MAX": + if reduce_op == "MAX": first_set = set() for index, s_id in enumerate(dst_index): if s_id not in first_set: @@ -200,7 +200,7 @@ def compute_graph_send_recv_for_min_max(inputs, attributes): else: 
results[s_id, :] = np.maximum(results[s_id, :], gather_x[index, :]) - elif pool_type == "MIN": + elif reduce_op == "MIN": first_set = set() for index, s_id in enumerate(dst_index): if s_id not in first_set: @@ -210,7 +210,7 @@ def compute_graph_send_recv_for_min_max(inputs, attributes): results[s_id, :] = np.minimum(results[s_id, :], gather_x[index, :]) else: - raise ValueError("Invalid pool_type, only MAX, MIN supported!") + raise ValueError("Invalid reduce_op, only MAX, MIN supported!") # Calculate backward gradient index_size = len(src_index) diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py index 25f4d3cb660f0..e8b5bdc7bb8f8 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_ue_recv_op.py @@ -103,7 +103,7 @@ def compute_graph_send_ue_recv_for_sum(inputs, attributes): y = inputs['Y'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] - message_op = attributes['compute_type'] + message_op = attributes['message_op'] gather_x = x[src_index] out_shp = [ @@ -126,7 +126,7 @@ def compute_graph_send_ue_recv_for_mean(inputs, attributes): y = inputs['Y'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] - message_op = attributes['compute_type'] + message_op = attributes['message_op'] gather_x = x[src_index] out_shp = [ @@ -155,8 +155,8 @@ def compute_graph_send_ue_recv_for_max_min(inputs, attributes): y = inputs['Y'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] - message_op = attributes['compute_type'] - reduce_op = attributes['pool_type'] + message_op = attributes['message_op'] + reduce_op = attributes['reduce_op'] gather_x = x[src_index] out_shp = [ @@ -277,7 +277,7 @@ def setUp(self): 'Src_index': self.src_index, 'Dst_index': self.dst_index } - self.attrs = {'compute_type': self.message_op, 'pool_type': 'SUM'} + self.attrs = {'message_op': self.message_op, 'reduce_op': 'SUM'} out = compute_graph_send_ue_recv_for_sum(self.inputs, self.attrs) @@ -389,7 +389,7 @@ def setUp(self): 'Src_index': self.src_index, 'Dst_index': self.dst_index } - self.attrs = {'compute_type': self.message_op, 'pool_type': 'MEAN'} + self.attrs = {'message_op': self.message_op, 'reduce_op': 'MEAN'} out, dst_count = compute_graph_send_ue_recv_for_mean( self.inputs, self.attrs) @@ -502,7 +502,7 @@ def setUp(self): 'Src_index': self.src_index, 'Dst_index': self.dst_index } - self.attrs = {'compute_type': self.message_op, 'pool_type': 'MAX'} + self.attrs = {'message_op': self.message_op, 'reduce_op': 'MAX'} out, self.gradients = compute_graph_send_ue_recv_for_max_min( self.inputs, self.attrs) @@ -618,7 +618,7 @@ def setUp(self): 'Src_index': self.src_index, 'Dst_index': self.dst_index } - self.attrs = {'compute_type': self.message_op, 'pool_type': 'MIN'} + self.attrs = {'message_op': self.message_op, 'reduce_op': 'MIN'} out, self.gradients = compute_graph_send_ue_recv_for_max_min( self.inputs, self.attrs) diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index cebd927566c97..bfe63f1f04d73 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -119,7 +119,7 @@ def send_u_recv(x, if _in_legacy_dygraph(): out_size = convert_out_size_to_list(out_size) out, tmp = _C_ops.graph_send_recv(x, src_index, - dst_index, None, 'pool_type', + dst_index, None, 'reduce_op', 
reduce_op.upper(), 'out_size', out_size) return out @@ -148,7 +148,7 @@ def send_u_recv(x, stop_gradient=True) inputs = {"X": x, "Src_index": src_index, "Dst_index": dst_index} - attrs = {"pool_type": reduce_op.upper()} + attrs = {"reduce_op": reduce_op.upper()} get_out_size_tensor_inputs(inputs=inputs, attrs=attrs, out_size=out_size, @@ -178,8 +178,8 @@ def send_ue_recv(x, This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding data, after computing with `y` in different compute types like add/sub/mul/div, then use `dst_index` to - update the corresponding position of output tensor in different pooling types, like sum, mean, max, or min. + to gather the corresponding data, after computing with `y` in different message ops like add/sub/mul/div, then use `dst_index` to + update the corresponding position of output tensor in different reduce ops, like sum, mean, max, or min. Besides, we can use `out_size` to set necessary output shape. .. code-block:: text @@ -215,8 +215,8 @@ def send_ue_recv(x, src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. - message_op (str): Different compute types for x and e, including `add`, `sub`, `mul`, `div`. - reduce_op (str): Different pooling types, including `sum`, `mean`, `max`, `min`. + message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. + reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. Default value is `sum`. out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size is smaller or equal to 0, then this input will not be used. @@ -287,8 +287,8 @@ def send_ue_recv(x, if _in_legacy_dygraph(): out_size = convert_out_size_to_list(out_size) out, tmp = _C_ops.graph_send_ue_recv(x, y, src_index, dst_index, - None, 'compute_type', - message_op.upper(), 'pool_type', + None, 'message_op', + message_op.upper(), 'reduce_op', reduce_op.upper(), 'out_size', out_size) return out @@ -322,7 +322,7 @@ def send_ue_recv(x, stop_gradient=True) inputs = {"X": x, "Y": y, "Src_index": src_index, "Dst_index": dst_index} - attrs = {"compute_type": message_op.upper(), "pool_type": reduce_op.upper()} + attrs = {"message_op": message_op.upper(), "reduce_op": reduce_op.upper()} get_out_size_tensor_inputs(inputs=inputs, attrs=attrs, out_size=out_size, diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py index 132a6d4657ca1..4181885d419af 100644 --- a/python/paddle/incubate/operators/graph_send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -69,7 +69,7 @@ def graph_send_recv(x, src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. - pool_type (str): The pooling type of graph_send_recv, including `sum`, `mean`, `max`, `min`. + pool_type (str): The pooling types of graph_send_recv, including `sum`, `mean`, `max`, `min`. Default value is `sum`. out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size is smaller or equal to 0, then this input will not be used. 
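For context, the user-visible effect of the renames in python/paddle/geometric/message_passing/send_recv.py is confined to the keyword names of the two new entry points. A minimal usage sketch (tensor values are illustrative only, and it assumes a build that already carries this patch; the example mirrors the docstring above):

    import paddle

    x = paddle.to_tensor([[0., 2., 3.], [1., 4., 5.], [2., 6., 7.]])
    y = paddle.to_tensor([1., 1., 1., 1.])  # one value per edge message
    src_index = paddle.to_tensor([0, 1, 2, 0], dtype="int32")
    dst_index = paddle.to_tensor([1, 2, 1, 0], dtype="int32")

    # Rows of x are gathered by src_index and scatter-reduced into the
    # dst_index slots; the reduction is now selected by reduce_op
    # (formerly pool_type).
    out = paddle.geometric.send_u_recv(x, src_index, dst_index,
                                       reduce_op="sum")

    # send_ue_recv additionally combines each gathered row with y through
    # message_op (formerly compute_type) before the same reduce step.
    out = paddle.geometric.send_ue_recv(x, y, src_index, dst_index,
                                        message_op="add", reduce_op="sum")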
@@ -123,7 +123,7 @@ def graph_send_recv(x,
     if _in_legacy_dygraph():
         out_size = convert_out_size_to_list(out_size)
         out, tmp = _C_ops.graph_send_recv(x, src_index,
-                                          dst_index, None, 'pool_type',
+                                          dst_index, None, 'reduce_op',
                                           pool_type.upper(), 'out_size',
                                           out_size)
         return out
@@ -151,7 +151,7 @@ def graph_send_recv(x,
                                 stop_gradient=True)
 
     inputs = {"X": x, "Src_index": src_index, "Dst_index": dst_index}
-    attrs = {"pool_type": pool_type.upper()}
+    attrs = {"reduce_op": pool_type.upper()}
     get_out_size_tensor_inputs(inputs=inputs,
                                attrs=attrs,
                                out_size=out_size,
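On the incubate wrapper patched above, the caller-facing keyword stays `pool_type`; only the attribute it forwards to the kernel is renamed to `reduce_op`, so existing user code keeps working unchanged. A short sketch under the same assumptions as the previous example (illustrative values, patched build):

    import paddle

    x = paddle.to_tensor([[0., 2., 3.], [1., 4., 5.], [2., 6., 7.]])
    src_index = paddle.to_tensor([0, 1, 2, 0], dtype="int32")
    dst_index = paddle.to_tensor([1, 2, 1, 0], dtype="int32")

    # Caller-facing keyword unchanged; the rename is internal to the op.
    out = paddle.incubate.graph_send_recv(x, src_index, dst_index,
                                          pool_type="sum")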