Fix cudnn error for BatchNorm1D kernel #43072

Merged
merged 16 commits on Jun 21, 2022
266 changes: 153 additions & 113 deletions paddle/phi/kernels/gpu/batch_norm_kernel.cu
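The gist of this change, visible in the diff below: for 2-D inputs (BatchNorm1D), where cuDNN uses its per-activation mode, batch sizes at or above the per-activation threshold now fall back to the hand-written CUDA kernel, and the cudnnBatchNormalizationForwardTrainingEx path, previously taken unconditionally whenever cuDNN >= 7.4.1 (the removed `bool called` block), moves into the opposite branch of that check. A minimal sketch of the dispatch rule follows; the helper name is hypothetical and the threshold is the CUDNN_PER_ACTIVATION_THRESHOLD introduced below, so treat this as an illustration rather than code from the patch.

```cpp
// Sketch only: mirrors the condition added in this diff. The helper name
// UseNativeBatchNormKernel is hypothetical; the threshold value is the one
// introduced below as CUDNN_PER_ACTIVATION_THRESHOLD.
#include <cstdint>
#include <vector>

bool UseNativeBatchNormKernel(const std::vector<int64_t> &x_dims, int64_t N) {
  // cuDNN handles rank-2 inputs in PER_ACTIVATION mode, which only supports
  // small batch sizes, so large 1-D batches use the native CUDA kernel.
  constexpr int64_t kCudnnPerActivationThreshold = 131070;
  return x_dims.size() == 2 && N >= kCudnnPerActivationThreshold;
}
```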
@@ -446,90 +446,81 @@ void BatchNormKernel(const Context &ctx,
paddle::framework::TensorCopy(x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;

bool called = false;
#if CUDNN_VERSION_MIN(7, 4, 1)
called = true;
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
DenseTensor reserve_space_tensor;
// Create reserve space and workspace for batch norm.
// Create a tensor for each batch_norm op; it will be used in the
// backward pass, so this tensor must not be a temporary.
// auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
if (reserve_space == nullptr) {
reserve_space = &reserve_space_tensor;
}
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
phi::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));

// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));

reserve_space->Resize({static_cast<int64_t>(reserve_space_size)});
reserve_space_ptr =
static_cast<void *>(ctx.template Alloc<uint8_t>(reserve_space));
workspace_tensor.Resize({static_cast<int64_t>(workspace_size)});
workspace_ptr =
static_cast<void *>(ctx.template Alloc<uint8_t>(&workspace_tensor));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle,
mode_,
CUDNN_BATCHNORM_OPS_BN,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
nullptr,
nullptr,
data_desc_,
transformed_y.template data<T>(),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
ctx.template Alloc<BatchNormParamType<T>>(mean_out),
ctx.template Alloc<BatchNormParamType<T>>(variance_out),
epsilon,
ctx.template Alloc<BatchNormParamType<T>>(saved_mean),
ctx.template Alloc<BatchNormParamType<T>>(saved_variance),
nullptr,
workspace_ptr,
workspace_size,
reserve_space_ptr,
reserve_space_size));
#endif // CUDNN_VERSION_MIN(7, 4, 1)
if (!called) {
#ifdef PADDLE_WITH_HIP
const int num = transformed_x.numel();
const int block = 256;
const int num = transformed_x.numel();
const int block = 256;
const int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
if (compute_format == DataLayout::kNCHW) {
BNForwardTraining<T, block, DataLayout::kNCHW>
<<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
} else {
BNForwardTraining<T, block, DataLayout::kNHWC>
<<<grid, block, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
this_factor,
transformed_y.template data<T>(),
mean_out->template data<BatchNormParamType<T>>(),
variance_out->template data<BatchNormParamType<T>>(),
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
#else
// CUDNN PER_ACTIVATION mode only supports small batch sizes
const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
const bool use_native_kernel =
(x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD);
if (use_native_kernel) {
const int block = 512;
const int max_threads = ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
const int grid = std::min(C, max_blocks);
@@ -566,35 +557,83 @@ void BatchNormKernel(const Context &ctx,
saved_mean->template data<BatchNormParamType<T>>(),
saved_variance->template data<BatchNormParamType<T>>());
}
// TODO(wangran16): wait for MIOpen to improve the performance of BN
// PADDLE_ENFORCE_GPU_SUCCESS(
// platform::dynload::miopenBatchNormalizationForwardTraining(
// handle, mode_, const_cast<void *>(static_cast<const void *>(
// CudnnDataType<T>::kOne())),
// const_cast<void *>(
// static_cast<const void *>(CudnnDataType<T>::kZero())),
// data_desc_,
// static_cast<const void *>(transformed_x.template data<T>()),
// data_desc_,
// static_cast<void *>(
// transformed_y.template mutable_data<T>(ctx.GetPlace())),
// bn_param_desc_,
// const_cast<void *>(static_cast<const void *>(
// scale->template data<BatchNormParamType<T>>())),
// const_cast<void *>(static_cast<const void *>(
// bias->template data<BatchNormParamType<T>>())),
// this_factor,
// static_cast<void *>(
// mean_out->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(variance_out->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace())),
// epsilon,
// static_cast<void *>(
// saved_mean->template mutable_data<BatchNormParamType<T>>(
// ctx.GetPlace())),
// static_cast<void *>(saved_variance->template mutable_data<
// BatchNormParamType<T>>(ctx.GetPlace()))));
} else {
#if CUDNN_VERSION_MIN(7, 4, 1)
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
DenseTensor workspace_tensor;
DenseTensor reserve_space_tensor;
// Create reserve space and workspace for batch norm.
// Create a tensor for each batch_norm op; it will be used in the
// backward pass, so this tensor must not be a temporary.
// auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
if (reserve_space == nullptr) {
reserve_space = &reserve_space_tensor;
}
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
phi::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*xDesc=*/data_desc_,
/*zDesc=*/nullptr,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/nullptr,
/*sizeInBytes=*/&workspace_size));

// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::
cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/CUDNN_BATCHNORM_OPS_BN,
/*activationDesc=*/nullptr,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));

reserve_space->Resize({static_cast<int64_t>(reserve_space_size)});
reserve_space_ptr =
static_cast<void *>(ctx.template Alloc<uint8_t>(reserve_space));
workspace_tensor.Resize({static_cast<int64_t>(workspace_size)});
workspace_ptr =
static_cast<void *>(ctx.template Alloc<uint8_t>(&workspace_tensor));
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle,
mode_,
CUDNN_BATCHNORM_OPS_BN,
CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(),
data_desc_,
transformed_x.template data<T>(),
nullptr,
nullptr,
data_desc_,
transformed_y.template data<T>(),
bn_param_desc_,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
this_factor,
ctx.template Alloc<BatchNormParamType<T>>(mean_out),
ctx.template Alloc<BatchNormParamType<T>>(variance_out),
epsilon,
ctx.template Alloc<BatchNormParamType<T>>(saved_mean),
ctx.template Alloc<BatchNormParamType<T>>(saved_variance),
nullptr,
workspace_ptr,
workspace_size,
reserve_space_ptr,
reserve_space_size));
#else
PADDLE_ENFORCE_GPU_SUCCESS(
paddle::platform::dynload::cudnnBatchNormalizationForwardTraining(
@@ -615,8 +654,9 @@ void BatchNormKernel(const Context &ctx,
epsilon,
ctx.template Alloc<BatchNormParamType<T>>(saved_mean),
ctx.template Alloc<BatchNormParamType<T>>(saved_variance)));
#endif
#endif // CUDNN_VERSION_MIN(7, 4, 1)
}
#endif
}
}

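As a footnote on the fallback path above: the native BNForwardTraining launch uses a fixed block size (256 in the HIP branch, 512 in the CUDA branch) and never launches more blocks than there are channels. A small sketch of that launch-configuration choice, assuming max_physical_threads is the value Paddle's device context returns from GetMaxPhysicalThreadCount(); the struct and function names here are illustrative, not from the patch.

```cpp
// Sketch of the launch configuration used for the fallback kernel in the
// diff above; LaunchConfig and NativeBNLaunchConfig are illustrative names.
#include <algorithm>

struct LaunchConfig {
  int grid;
  int block;
};

LaunchConfig NativeBNLaunchConfig(int C, int max_physical_threads,
                                  bool is_hip) {
  const int block = is_hip ? 256 : 512;  // block sizes taken from the diff
  const int max_blocks = std::max(max_physical_threads / block, 1);
  const int grid = std::min(C, max_blocks);  // no more blocks than channels
  return {grid, block};
}
```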