[geometric] Add paddle.geometric.send_ue_recv API (PaddlePaddle#43174)
* add init file

* add op definition and infermeta

* add kernel definition funcs

* add broadcast infer shape

* add gpu forward kernel

* delete SUB and DIV

* add x_grad

* add template

* add e_grad for min and max

* fix small bug

* temp commit

* temp commit

* add e_grad for sum and mean

* fix some compile bug

* fix compile bugs

* fix compile problem

* add sum forward unittest

* fix broadcast error, add kernel sig, register e_grad, change unit test

* fix grad

* add temp grad fix

* temp commit

* add min max unittest

* add max, min unittest, fix mul bug

* add cpu forward sum and mean

* add forward min max, fix mean unittest

* add cpu backward min max

* fix code-style

* add backward sum mean

* fix rocm ci

* set unittest timeout

* fix bug of x broadcast to e, gpu grad

* fix bug of x broadcast to e, cpu grad

* rename BOOST_GET_CONST macro

* fix rocm ci

* mv graph_send_e_recv to graph_send_ue_recv

* move out_size to IntArray

* add eager op test

* fix max pool type bug, add unittest for api

* revise api doc

* add fp16 for atomic min and max, add unittest

* add unittest

* add fp16 support for graph_send_recv

* fix unittest fp16 bug

* change OutSizeTensor to Out_size

* move E to Y

* add copyright, fix comment

* review code

* fix thread block size

* fix thread block size

* change api attribute name: pool_type to reduce_op, compute_type to message_op

* change api attribute name, move pool_type to reduce_op, move compute_type to message_op
DesmonDay authored and root committed Oct 12, 2022
1 parent f33d447 commit 6e1e27f
Showing 36 changed files with 4,185 additions and 144 deletions.
12 changes: 6 additions & 6 deletions paddle/fluid/operators/graph_send_recv_op.cc
@@ -64,9 +64,9 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker {
.AsDispensable();
AddOutput("Out", "Output tensor of graph_send_recv op.");
AddOutput("Dst_count",
"Count tensor of Dst_index, mainly for MEAN pool_type.")
"Count tensor of Dst_index, mainly for MEAN reduce_op.")
.AsIntermediate();
AddAttr<std::string>("pool_type",
AddAttr<std::string>("reduce_op",
"(string, default 'SUM')"
"Define different pool types to receive the result "
"tensors of Dst_index.")
@@ -81,7 +81,7 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC(
Graph Learning Send_Recv combine operator.
- $Out = Recv(Send(X, Src_index), Dst_index, pool_type)$
+ $Out = Recv(Send(X, Src_index), Dst_index, reduce_op)$
This operator is mainly used in Graph Learning domain, and the main purpose is to reduce
intermediate memory consumption in the process of message passing.
@@ -105,12 +105,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
op->SetInput("Dst_index", this->Input("Dst_index"));
op->SetInput("X", this->Input("X"));

- if (PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") {
+ if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
}

- if (PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" ||
-     PADDLE_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") {
+ if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" ||
+     PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") {
op->SetInput("Out", this->Output("Out"));
}

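A note on the grad-maker logic above: the extra inputs are wired in because each reduce_op needs different forward-pass information for its backward rule. MEAN needs Dst_count to split the incoming gradient among contributors, while MIN and MAX need the forward Out to route the gradient only to the element(s) that produced the extremum. A minimal scalar sketch of those rules for 1-D float features — hypothetical standalone code and names, not the actual Paddle kernel:

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Hypothetical sketch of the graph_send_recv backward rules. `out` is only
// consulted for MIN/MAX and `dst_count` only for MEAN, which is exactly why
// the grad op maker wires them in conditionally.
void SendRecvGradSketch(const std::vector<float>& x,          // forward input
                        const std::vector<float>& out,        // forward output (MIN/MAX)
                        const std::vector<float>& out_grad,   // gradient w.r.t. Out
                        const std::vector<int>& dst_count,    // per-row counts (MEAN)
                        const std::vector<int64_t>& src_index,
                        const std::vector<int64_t>& dst_index,
                        const std::string& reduce_op,
                        std::vector<float>* x_grad) {
  x_grad->assign(x.size(), 0.0f);
  for (std::size_t e = 0; e < src_index.size(); ++e) {
    const int64_t s = src_index[e], d = dst_index[e];
    if (reduce_op == "SUM") {
      (*x_grad)[s] += out_grad[d];
    } else if (reduce_op == "MEAN") {
      // Each contributor to row d receives grad / count.
      (*x_grad)[s] += out_grad[d] / static_cast<float>(dst_count[d]);
    } else {  // MIN or MAX
      // Only the element equal to the reduced result receives gradient.
      if (x[s] == out[d]) (*x_grad)[s] += out_grad[d];
    }
  }
}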
150 changes: 150 additions & 0 deletions paddle/fluid/operators/graph_send_ue_recv_op.cc
@@ -0,0 +1,150 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"

namespace paddle {
namespace operators {

class GraphSendUERecvOP : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;

protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
ctx.device_context());
}
};

class GraphSendUERecvGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;

void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
auto y_dims = ctx->GetInputDim("Y");
ctx->SetOutputDim(framework::GradVarName("Y"), y_dims);
}

protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context());
}
};

class GraphSendUERecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor with data type float32, float64, int32, int64.");
AddInput("Y",
"The input edge weight tensor, data type should be same with X");
AddInput("Src_index", "The source index tensor.");
AddInput("Dst_index", "The destination index tensor.");
AddInput("Out_size",
"(Tensor<int>, optional). The 0th dimension of the output."
"It has a higher priority than Attr(out_size).")
.AsDispensable();
AddOutput("Out", "Output tensor of graph_send_ue_recv op.");
AddOutput("Dst_count",
"Count tensor of Dst_index, mainly for MEAN reduce_op.")
.AsIntermediate();
AddAttr<std::string>("message_op",
"(string, default 'ADD')"
"Define differenct computation types between X and E.")
.SetDefault("ADD")
.InEnum({"ADD", "MUL"});
AddAttr<std::string>("reduce_op",
"(string, default 'SUM')"
"Define different pool types to receive the result "
"tensors of Dst_index.")
.SetDefault("SUM")
.InEnum({"SUM", "MEAN", "MIN", "MAX"});
AddAttr<std::vector<int64_t>>(
"out_size",
"(vector<int64_t>, default {0})"
"Define the first dimension of Output tensor."
"If set default {0}, then the shape of Out is the same with X.")
.SetDefault({0});
AddComment(R"DOC(
Graph Learning Send_UE_Recv combine operator.
$Out = Recv(Compute(Send(X, Src_index), Y, message_op), Dst_index, reduce_op)$
This operator is mainly used in the Graph Learning domain, and its main purpose is to reduce
intermediate memory consumption in the process of message passing.
Taking `X` as the input tensor, we first use `src_index` to gather the corresponding data.
The gathered data is then combined with `Y` under the chosen message_op (add or mul), and
`dst_index` is used to scatter the result into the corresponding positions of the output
tensor under the chosen reduce type (sum, mean, max, or min).
)DOC");
}
};

template <typename T>
class GraphSendUERecvGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("graph_send_ue_recv_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Y", this->Input("Y"));
op->SetInput("Src_index", this->Input("Src_index"));
op->SetInput("Dst_index", this->Input("Dst_index"));

if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
}

if (PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MIN" ||
PADDLE_GET_CONST(std::string, this->GetAttr("reduce_op")) == "MAX") {
op->SetInput("Out", this->Output("Out"));
}

op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y"));
op->SetAttrMap(this->Attrs());
}
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;

DECLARE_INFER_SHAPE_FUNCTOR(graph_send_ue_recv,
GraphSendUERecvInferShapeFunctor,
PD_INFER_META(phi::GraphSendUERecvInferMeta));
REGISTER_OPERATOR(graph_send_ue_recv,
ops::GraphSendUERecvOP,
ops::GraphSendUERecvOpMaker,
ops::GraphSendUERecvGradOpMaker<paddle::framework::OpDesc>,
ops::GraphSendUERecvGradOpMaker<paddle::imperative::OpBase>,
GraphSendUERecvInferShapeFunctor);
REGISTER_OPERATOR(graph_send_ue_recv_grad, ops::GraphSendUERecvGradOp);
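To make the DOC formula above concrete: the forward pass gathers rows of X by Src_index, combines each gathered row with the per-edge tensor Y under message_op, and scatter-reduces the messages into Out at the rows given by Dst_index. Below is a minimal sketch for 1-D features and SUM reduction, assuming one weight per edge — hypothetical standalone code; the real kernels also handle broadcasting between X and Y, the MEAN/MIN/MAX reductions, and GPU execution:

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Hypothetical 1-D sketch of the graph_send_ue_recv forward pass:
// gather x by src_index, combine with the edge weight y via message_op,
// then scatter-add into out by dst_index (SUM reduction; MEAN would
// divide each row by its dst_count afterwards).
std::vector<float> SendUERecvSketch(const std::vector<float>& x,
                                    const std::vector<float>& y,  // one value per edge
                                    const std::vector<int64_t>& src_index,
                                    const std::vector<int64_t>& dst_index,
                                    const std::string& message_op,
                                    int64_t out_size) {
  std::vector<float> out(static_cast<std::size_t>(out_size), 0.0f);
  for (std::size_t e = 0; e < src_index.size(); ++e) {
    const float msg = (message_op == "ADD") ? x[src_index[e]] + y[e]
                                            : x[src_index[e]] * y[e];  // "MUL"
    out[dst_index[e]] += msg;
  }
  return out;
}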
98 changes: 98 additions & 0 deletions paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -419,6 +419,55 @@ CUDA_ATOMIC_WRAPPER(Max, double) {
return __longlong_as_double(old);
}

#ifdef PADDLE_CUDA_FP16
inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) {
float16 low_half;
// The float16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<float16>(max(static_cast<float>(low_half), x));
return (val & 0xFFFF0000u) | low_half.x;
}

inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) {
float16 high_half;
// The float16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<float16>(max(static_cast<float>(high_half), x));
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}

CUDA_ATOMIC_WRAPPER(Max, float16) {
if (*address >= val) {
return *address;
}
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The float16 value stays at the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The float16 value stays at the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif

// For atomicMin
USE_CUDA_ATOMIC(Min, int);
USE_CUDA_ATOMIC(Min, unsigned int);
@@ -503,5 +552,54 @@ CUDA_ATOMIC_WRAPPER(Min, double) {
return __longlong_as_double(old);
}

#ifdef PADDLE_CUDA_FP16
inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) {
float16 low_half;
// The float16 in lower 16bits
low_half.x = static_cast<uint16_t>(val & 0xFFFFu);
low_half = static_cast<float16>(min(static_cast<float>(low_half), x));
return (val & 0xFFFF0000u) | low_half.x;
}

inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) {
float16 high_half;
// The float16 in higher 16bits
high_half.x = static_cast<uint16_t>(val >> 16);
high_half = static_cast<float16>(min(static_cast<float>(high_half), x));
return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}

CUDA_ATOMIC_WRAPPER(Min, float16) {
if (*address <= val) {
return *address;
}
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The float16 value stays at the lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The float16 value stays at the higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f));
} while (old != assumed);
float16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif

} // namespace platform
} // namespace paddle
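Both float16 wrappers above use the same trick: CUDA has no 16-bit atomicCAS, so the address is aligned down to its containing 32-bit word, the new half-precision value is spliced into the low or high 16 bits, and atomicCAS retries until the word is observed unchanged. The host-side sketch below reproduces the splice-and-retry loop with std::atomic on plain uint16_t payloads — an illustration of the pattern only; the device code additionally converts through float for the max/min comparison:

#include <atomic>
#include <cstdint>

// Host-side illustration of atomic max on a 16-bit value packed inside a
// 32-bit word, mirroring the splice-and-retry loop in gpu_primitives.h.
// Returns the previous value of the targeted half, like CUDA atomicMax.
uint16_t AtomicMaxU16InWord(std::atomic<uint32_t>* word, bool low_half,
                            uint16_t val) {
  uint32_t old_word = word->load();
  for (;;) {
    const uint16_t cur = low_half ? static_cast<uint16_t>(old_word & 0xFFFFu)
                                  : static_cast<uint16_t>(old_word >> 16);
    if (cur >= val) return cur;  // current value is already the max
    // Splice the new half into the word, leaving the other half intact.
    const uint32_t new_word =
        low_half ? ((old_word & 0xFFFF0000u) | val)
                 : ((old_word & 0x0000FFFFu) |
                    (static_cast<uint32_t>(val) << 16));
    // On failure, compare_exchange_weak reloads old_word with the observed
    // value, so the loop retries with fresh data — exactly like atomicCAS.
    if (word->compare_exchange_weak(old_word, new_word)) return cur;
  }
}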
1 change: 1 addition & 0 deletions paddle/fluid/pybind/op_function_generator.h
@@ -226,6 +226,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
"Mean3",
"Var3"}},
{"graph_send_recv", {"X", "Src_index", "Dst_index", "Out_size"}},
{"graph_send_ue_recv", {"X", "Y", "Src_index", "Dst_index", "Out_size"}},
};

// NOTE(zhiqiu): Like op_ins_map.
13 changes: 12 additions & 1 deletion paddle/phi/api/yaml/legacy_api.yaml
@@ -989,7 +989,7 @@
backward : gelu_grad

- api : graph_send_recv
- args : (Tensor x, Tensor src_index, Tensor dst_index, str pool_type = "SUM", IntArray out_size = {0})
+ args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0})
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : GraphSendRecvInferMeta
@@ -999,6 +999,17 @@
intermediate : dst_count
backward : graph_send_recv_grad

- api : graph_send_ue_recv
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size)
output : Tensor(out), Tensor(dst_count)
infer_meta :
func : GraphSendUERecvInferMeta
kernel :
func : graph_send_ue_recv
data_type : x
intermediate : dst_count
backward : graph_send_ue_recv_grad

- api : greater_equal
args : (Tensor x, Tensor y, int axis = -1)
output : Tensor
16 changes: 14 additions & 2 deletions paddle/phi/api/yaml/legacy_backward.yaml
@@ -898,8 +898,8 @@
func : gelu_grad

- backward_api : graph_send_recv_grad
- forward : graph_send_recv (Tensor x, Tensor src_index, Tensor dst_index, str pool_type = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count)
- args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str pool_type = "SUM")
+ forward : graph_send_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count)
+ args : (Tensor x, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM")
output : Tensor(x_grad)
infer_meta :
func : GeneralUnaryGradInferMeta
@@ -909,6 +909,18 @@
data_type : out_grad
optional: out, dst_count

- backward_api : graph_send_ue_recv_grad
forward : graph_send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count)
args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : graph_send_ue_recv_grad
data_type : out_grad
optional: out, dst_count

# grid sample
- backward_api : grid_sample_grad
forward : grid_sample (Tensor x, Tensor grid, str mode, str padding_mode, bool align_corners) -> Tensor(out)
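For the new backward entry, the grad kernel must produce both x_grad and y_grad. With SUM reduction the chain rule per edge is straightforward: out_grad at the destination row flows back unchanged for message_op ADD, and is scaled by the opposite operand for MUL. A minimal 1-D sketch under those assumptions — hypothetical standalone code; MEAN/MIN/MAX additionally use dst_count or out, which is why they appear as optional inputs above:

#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Hypothetical 1-D sketch of graph_send_ue_recv backward for SUM reduction.
// Per edge e from s = src_index[e] to d = dst_index[e]:
//   ADD: d out/d x = 1,    d out/d y = 1
//   MUL: d out/d x = y[e], d out/d y = x[s]
void SendUERecvGradSumSketch(const std::vector<float>& x,
                             const std::vector<float>& y,
                             const std::vector<float>& out_grad,
                             const std::vector<int64_t>& src_index,
                             const std::vector<int64_t>& dst_index,
                             const std::string& message_op,
                             std::vector<float>* x_grad,
                             std::vector<float>* y_grad) {
  x_grad->assign(x.size(), 0.0f);
  y_grad->assign(y.size(), 0.0f);
  for (std::size_t e = 0; e < src_index.size(); ++e) {
    const int64_t s = src_index[e], d = dst_index[e];
    if (message_op == "ADD") {
      (*x_grad)[s] += out_grad[d];
      (*y_grad)[e] += out_grad[d];
    } else {  // "MUL"
      (*x_grad)[s] += out_grad[d] * y[e];
      (*y_grad)[e] += out_grad[d] * x[s];
    }
  }
}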
