
Commit

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into nanmedian
thunder95 committed May 26, 2022
2 parents 6744d0a + 6af32a7 commit 8021de0
Showing 14 changed files with 1,012 additions and 611 deletions.
21 changes: 20 additions & 1 deletion paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu
@@ -106,6 +106,23 @@ __global__ void dy_mf_search_kernel(Table* table,
for (int j = 0; j < cur->mf_dim + 1; ++j) {
cur->mf[j] = input.mf[j];
}
} else {
if (keys[i] != 0) {
printf("warning::pull miss key: %d", keys[i]);
}
FeatureValue* cur = (FeatureValue*)(vals + i * pull_feature_value_size);
cur->delta_score = 0;
cur->show = 0;
cur->clk = 0;
cur->slot = -1;
cur->lr = 0;
cur->lr_g2sum = 0;
cur->mf_size = 0;
cur->mf_dim = 8;
cur->cpu_ptr = 0;  // original line read "cur->cpu_ptr;", a no-op; zero-initializing the field is the presumed intent
for (int j = 0; j < cur->mf_dim + 1; j++) {
cur->mf[j] = 0;
}
}
}
}
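The new else branch gives a pull that misses the hashtable a well-defined default value instead of leaving the output slot uninitialized. A minimal host-side sketch of that default fill, using the field names visible in this diff (the struct layout and helper are illustrative, not Paddle's actual definitions):

```cpp
#include <cstdint>

// Illustrative stand-in for the FeatureValue fields this diff touches.
struct FeatureValue {
  float delta_score, show, clk;
  int slot;
  float lr, lr_g2sum;
  int mf_size, mf_dim;
  std::uint64_t cpu_ptr;
  float mf[9];  // mf_dim + 1 floats; 8 + 1 with the default mf_dim
};

// Same default fill as the kernel's miss path above.
inline void FillMissedValue(FeatureValue* cur) {
  cur->delta_score = 0;
  cur->show = 0;
  cur->clk = 0;
  cur->slot = -1;  // sentinel: no slot assigned
  cur->lr = 0;
  cur->lr_g2sum = 0;
  cur->mf_size = 0;
  cur->mf_dim = 8;
  cur->cpu_ptr = 0;
  for (int j = 0; j < cur->mf_dim + 1; ++j) cur->mf[j] = 0;
}
```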
@@ -138,7 +155,9 @@ __global__ void dy_mf_update_kernel(Table* table,
FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size);
sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur);
} else {
printf("warning: push miss key: %d", keys[i]);
if (keys[i] != 0) {
printf("warning::push miss key: %d", keys[i]);
}
}
}
}
3 changes: 0 additions & 3 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -239,9 +239,6 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";
for (int i = 0; i < thread_keys_shard_num_; i++) {
for (int j = 0; j < multi_mf_dim_; j++) {
if (i == 0 && j == multi_mf_dim_ - 1) {
gpu_task->feature_dim_keys_[i][j].push_back(0);
}
VLOG(0) << "GpuPs shard: " << i << "mf dim: " << index_dim_vec_[j]
<< " key len: " << gpu_task->feature_dim_keys_[i][j].size();
gpu_task->value_dim_ptr_[i][j].resize(
6 changes: 4 additions & 2 deletions paddle/fluid/framework/ir/yolo_box_fuse_pass.cc
@@ -199,9 +199,11 @@ void YoloBoxFusePass::ApplyImpl(ir::Graph* graph) const {
GET_IR_NODE(nms_out_rois_num);
#undef GET_IR_NODE

auto* block = yolo_box0->Op()->Block();

// create yolo_box_head
#define CREATE_YOLO_BOX_HEAD(idx_) \
framework::OpDesc yolo_box_head##idx_##_op_desc; \
framework::OpDesc yolo_box_head##idx_##_op_desc(block); \
yolo_box_head##idx_##_op_desc.SetType("yolo_box_head"); \
yolo_box_head##idx_##_op_desc.SetInput("X", \
{yolo_box##idx_##_in_x->Name()}); \
@@ -222,7 +224,7 @@ void YoloBoxFusePass::ApplyImpl(ir::Graph* graph) const {
#undef CREATE_YOLO_BOX_HEAD

// create yolo_box_post
framework::OpDesc yolo_box_post_op_desc;
framework::OpDesc yolo_box_post_op_desc(block);
yolo_box_post_op_desc.SetType("yolo_box_post");
yolo_box_post_op_desc.SetInput("Boxes0", {yolo_box0_out_boxes->Name()});
yolo_box_post_op_desc.SetInput("Boxes1", {yolo_box1_out_boxes->Name()});
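Both hunks make the same change: the fused op descriptions are now constructed with the surrounding block rather than detached from it. Expanding CREATE_YOLO_BOX_HEAD(0) by hand shows the effect of the ##idx_## token pasting (only the macro lines visible in this diff are expanded; the tail of the macro is elided above):

```cpp
// Hand-expansion of CREATE_YOLO_BOX_HEAD(0) after this change:
framework::OpDesc yolo_box_head0_op_desc(block);  // now bound to yolo_box0's block
yolo_box_head0_op_desc.SetType("yolo_box_head");
yolo_box_head0_op_desc.SetInput("X", {yolo_box0_in_x->Name()});
```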
220 changes: 0 additions & 220 deletions paddle/fluid/operators/instance_norm_op.cc
@@ -170,104 +170,6 @@ NCHW `[batch, in_channels, in_height, in_width]`
)DOC");
}

template <typename T>
class InstanceNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));

const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();

const int N = x_dims[0];
const int C = x_dims[1];
const int NxC = N * C;

const int sample_size = x->numel() / N / C;

auto *y = ctx.Output<Tensor>("Y");
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");

auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto *place = dev_ctx.eigen_device();

Eigen::DSizes<int, 2> shape(NxC, sample_size);
// Once eigen on Windows is updated, the if branch can be removed.
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::DSizes<int, 2> bcast(1, sample_size);
Eigen::DSizes<int, 2> C_shape(C, 1);
Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
Eigen::DSizes<int, 1> rdims(1);
#else
Eigen::IndexList<Eigen::type2index<1>, int> bcast;
bcast.set(1, sample_size);
Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
C_shape.set(0, C);
Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
NxC_shape.set(0, NxC);
Eigen::IndexList<Eigen::type2index<1>> rdims;
#endif

phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_constant;

saved_mean->mutable_data<T>(ctx.GetPlace());
saved_variance->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, saved_mean, static_cast<T>(0));
set_constant(dev_ctx, saved_variance, static_cast<T>(0));

auto saved_mean_a = framework::EigenVector<T>::Flatten(*saved_mean);
auto saved_mean_e = saved_mean_a.reshape(NxC_shape);
auto saved_variance_a = framework::EigenVector<T>::Flatten(*saved_variance);
auto saved_variance_e = saved_variance_a.reshape(NxC_shape);

auto x_e = framework::EigenVector<T>::Flatten(*x);
auto x_arr = x_e.reshape(shape);

saved_mean_e.device(*place) = x_arr.mean(rdims);
auto saved_variance_arr =
(x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;

saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse();

const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");

Tensor scale_data;
Tensor bias_data;
if (!scale) {
scale_data.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &scale_data, static_cast<T>(1));
}

if (!bias) {
bias_data.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &bias_data, static_cast<T>(0));
}
auto scale_e = scale
? framework::EigenVector<T>::Flatten(*scale)
: framework::EigenVector<T>::Flatten(
const_cast<const framework::Tensor &>(scale_data));
auto scale_arr = scale_e.reshape(C_shape);
auto bias_e = bias ? framework::EigenVector<T>::Flatten(*bias)
: framework::EigenVector<T>::Flatten(
const_cast<const framework::Tensor &>(bias_data));
auto bias_arr = bias_e.reshape(C_shape);

y->mutable_data<T>(ctx.GetPlace());
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto y_arr = y_e.reshape(shape);

// (x - mean) * inv_std * scale + bias
Eigen::DSizes<int, 2> bcast_param(N, sample_size);
y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) *
saved_variance_e.broadcast(bcast) *
scale_arr.broadcast(bcast_param) +
bias_arr.broadcast(bcast_param);
}
};
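For reference, the removed forward kernel normalizes each (n, c) instance over its spatial elements: y = (x - mean) * inv_std * scale + bias, with inv_std = 1/sqrt(var + epsilon). A self-contained sketch of the same math in plain C++ (names are illustrative; the kernel above expressed this with Eigen reshapes and broadcasts over an (N*C, H*W) view):

```cpp
#include <cmath>
#include <vector>

// Reference instance norm: one mean/variance per (n, c) instance.
void InstanceNormRef(const std::vector<float>& x, int N, int C, int HW,
                     const std::vector<float>& scale,  // size C (1s if absent)
                     const std::vector<float>& bias,   // size C (0s if absent)
                     float epsilon, std::vector<float>* y) {
  y->resize(x.size());
  for (int i = 0; i < N * C; ++i) {  // one (n, c) instance per row
    const float* xi = x.data() + static_cast<size_t>(i) * HW;
    float mean = 0.f, var = 0.f;
    for (int k = 0; k < HW; ++k) mean += xi[k];
    mean /= HW;
    for (int k = 0; k < HW; ++k) var += (xi[k] - mean) * (xi[k] - mean);
    var /= HW;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    const int c = i % C;  // scale/bias broadcast over N and the spatial dims
    for (int k = 0; k < HW; ++k)
      (*y)[static_cast<size_t>(i) * HW + k] =
          (xi[k] - mean) * inv_std * scale[c] + bias[c];
  }
}
```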

void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
@@ -312,120 +214,6 @@ framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}

template <typename T>
class InstanceNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");

const auto &x_dims = x->dims();

const int N = x_dims[0];
const int C = x_dims[1];
const int NxC = N * C;
const int sample_size = x->numel() / N / C;

auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());

auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
auto *place = dev_ctx.eigen_device();

Eigen::DSizes<int, 2> rshape(NxC, sample_size);
Eigen::DSizes<int, 2> param_shape(N, C);
Eigen::DSizes<int, 2> shape(NxC, sample_size);
#ifndef EIGEN_HAS_INDEX_LIST
Eigen::DSizes<int, 1> rdims(0);
Eigen::DSizes<int, 1> mean_rdims(1);
Eigen::DSizes<int, 2> bcast(1, sample_size);
Eigen::DSizes<int, 2> C_shape(C, 1);
Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
#else
Eigen::IndexList<Eigen::type2index<0>> rdims;
Eigen::IndexList<Eigen::type2index<1>> mean_rdims;
Eigen::IndexList<Eigen::type2index<1>, int> bcast;
bcast.set(1, sample_size);
Eigen::IndexList<int, Eigen::type2index<1>> C_shape;
C_shape.set(0, C);
Eigen::IndexList<int, Eigen::type2index<1>> NxC_shape;
NxC_shape.set(0, NxC);
#endif

phi::funcs::SetConstant<platform::CPUDeviceContext, T> set_constant;

Tensor scale_data;
if (!scale) {
scale_data.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &scale_data, static_cast<T>(1));
}

auto scale_e = scale
? framework::EigenVector<T>::Flatten(*scale)
: framework::EigenVector<T>::Flatten(
const_cast<const framework::Tensor &>(scale_data));
auto mean_e = framework::EigenVector<T>::Flatten(*saved_mean);
auto inv_var_e = framework::EigenVector<T>::Flatten(*saved_inv_variance);
auto dy_e = framework::EigenVector<T>::Flatten(*d_y);
auto x_e = framework::EigenVector<T>::Flatten(*x);

auto scale_arr = scale_e.reshape(C_shape);
auto mean_arr = mean_e.reshape(NxC_shape);
auto inv_var_arr = inv_var_e.reshape(NxC_shape);
auto dy_arr = dy_e.reshape(shape);
auto x_arr = x_e.reshape(shape);

auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) *
inv_var_arr.eval().broadcast(bcast);

// math: d_bias = np.sum(d_y, axis=(n, h, w))
// math: d_scale = np.sum((X - mean) * inv_std * d_y, axis=(n, h, w))
if (d_scale && d_bias) {
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
set_constant(dev_ctx, d_scale, static_cast<T>(0));
set_constant(dev_ctx, d_bias, static_cast<T>(0));

auto d_scale_e = framework::EigenVector<T>::Flatten(*d_scale);
auto d_scale_data = d_scale_e.reshape(C_shape);
auto d_bias_e = framework::EigenVector<T>::Flatten(*d_bias);
auto d_bias_data = d_bias_e.reshape(C_shape);
d_bias_data.device(*place) =
dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
d_scale_data.device(*place) =
(tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims);
}

auto dy_mean =
dy_arr.mean(mean_rdims).reshape(NxC_shape).eval().broadcast(bcast);

Eigen::DSizes<int, 2> bcast_param(N, sample_size);
set_constant(dev_ctx, d_x, static_cast<T>(0));
// math: d_x = scale * inv_var * (d_y - np.mean(d_y, axis=(h, w))
//        - (X - mean) * inv_var^2 * np.mean(d_y * (X - mean), axis=(h, w)))
auto dx_e = framework::EigenVector<T>::Flatten(*d_x);
auto dx_arr = dx_e.reshape(shape);
dx_arr.device(*place) = scale_arr.broadcast(bcast_param) *
inv_var_arr.broadcast(bcast) *
(dy_arr - dy_mean -
tmp *
(dy_arr * tmp)
.mean(mean_rdims)
.reshape(NxC_shape)
.eval()
.broadcast(bcast));
}
};
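Written out from the kernel's math comments and Eigen expressions (overbars are per-instance means over h, w; the parameter gradients additionally sum over n; s is the saved inverse standard deviation), the removed backward kernel computes:

```latex
\hat{x} = (x - \mu)\, s, \qquad s = \frac{1}{\sqrt{\sigma^2 + \epsilon}} \\
d\beta = \sum_{n,h,w} dy, \qquad d\gamma = \sum_{n,h,w} dy \,\hat{x} \\
dx = \gamma\, s \left( dy - \overline{dy} - \hat{x}\, \overline{dy\,\hat{x}} \right)
```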

void InstanceNormDoubleGradOp::InferShape(
framework::InferShapeContext *ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad");
@@ -699,14 +487,6 @@ REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp,
REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp,
ops::InstanceNormDoubleGradOpInplaceInferer);

REGISTER_OP_CPU_KERNEL(
instance_norm,
ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, float>,
ops::InstanceNormKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
instance_norm_grad,
ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::InstanceNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
instance_norm_grad_grad,
ops::InstanceNormDoubleGradKernel<paddle::platform::CPUDeviceContext,

1 comment on commit 8021de0

@paddle-bot-old


Congratulations! Your pull request passed all required CI checks. You can ask the reviewer(s) to approve and merge. 🎉
