[Inductor][Quant] Change the QConv output scale name #124246

Renames the QConv output-scale argument from inv_output_scale to output_scale across the native oneDNN kernels, the registered op schemas, and the Inductor quantization passes. The stale comment describing the argument as "the reciprocal of scale in fake quant" is dropped along with the rename, and the forward/backward-compatibility allow-list entries for the onednn::qconv*_pointwise ops are extended to cover the schema change.

aten/src/ATen/native/quantized/cpu/OnednnUtils.h (2 changes: 1 addition & 1 deletion)

@@ -485,7 +485,7 @@ static at::Tensor _quantized_convolution_onednn(
     torch::List<int64_t> dilation,
     bool transposed,
     int64_t groups,
-    double inv_output_scale,
+    double output_scale,
     int64_t output_zero_point,
     c10::optional<at::Tensor> accum=c10::nullopt, // accum to be fused with conv add
     double accum_scale=1.0,
aten/src/ATen/native/quantized/cpu/qconv.cpp (26 changes: 13 additions & 13 deletions)

@@ -1397,7 +1397,7 @@ static at::Tensor _quantized_convolution_onednn(
     torch::List<int64_t> dilation,
     bool transposed,
     int64_t groups,
-    double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant
+    double output_scale,
     int64_t output_zero_point,
     c10::optional<at::Tensor> accum, // accum to be fused with conv add
     double accum_scale,
@@ -1420,10 +1420,10 @@ static at::Tensor _quantized_convolution_onednn(
   bool bfloat16_output = output_dtype.has_value() && (output_dtype.value() == c10::kBFloat16);
   if (fp32_output || bfloat16_output) {
     // For fp32 or bf16 output, oneDNN expects op_attr not to call set_scales or set_zero_points.
-    // So, we will use default inv_output_scale as 1.0 and output_zero_point as 0, since
-    // when inv_output_scale is 1.0, we will skip invoking of op_attr.set_scales in ideep;
+    // So we use the default output_scale of 1.0 and output_zero_point of 0, since
+    // when output_scale is 1.0, we skip invoking op_attr.set_scales in ideep;
     // when output_zero_point is 0, we skip invoking op_attr.set_zero_points in ideep.
-    TORCH_CHECK(inv_output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, inv_output_scale must be 1.0.");
+    TORCH_CHECK(output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, output_scale must be 1.0.");
     TORCH_CHECK(output_zero_point == 0, " (ONEDNN): fp32 or bf16 output, output_zero_point must be 0");
   }
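For context (an editor's sketch, not part of the diff): with fp32/bf16 output the kernel skips requantization entirely, so only the neutral quantization parameters are legal. In Python terms, with hypothetical values standing in for the kernel arguments:

    import torch

    # Hypothetical argument values standing in for the kernel's parameters.
    output_dtype, output_scale, output_zero_point = torch.bfloat16, 1.0, 0

    if output_dtype in (torch.float32, torch.bfloat16):
        # No requantization step runs; scale 1.0 / zero point 0 make ideep
        # skip op_attr.set_scales / op_attr.set_zero_points, as noted above.
        assert output_scale == 1.0 and output_zero_point == 0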

@@ -1634,7 +1634,7 @@ static at::Tensor _quantized_convolution_onednn(
   int oc_per_group = packed_weight.get_dim(0) / groups;
   int wei_scale_mask = ideep::utils::conv_weight_scale_mask(weight_scales.numel(), oc_per_group, groups, false);
   op_attr.set_scales_mask(DNNL_ARG_WEIGHTS, wei_scale_mask);
-  if (inv_output_scale != 1.0f) {
+  if (output_scale != 1.0f) {
     op_attr.set_scales_mask(DNNL_ARG_DST, 0);
   }
   if (output_zero_point != 0) {
@@ -1671,13 +1671,13 @@ static at::Tensor _quantized_convolution_onednn(
   }
   tensor src_scales_t = tensor(ideep::scale_t(1, act_scale));
   tensor wei_scales_t = tensor(weights_scales);
-  tensor dst_scales_t = tensor(ideep::scale_t(1, inv_output_scale));
+  tensor dst_scales_t = tensor(ideep::scale_t(1, output_scale));
   tensor src_zp_t = tensor(ideep::zero_point_t(1, act_zero_point));
   tensor dst_zp_t = tensor(ideep::zero_point_t(1, output_zero_point));
   if (act_scale != 1.0f) {
     args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales_t});
   }
-  if (inv_output_scale != 1.0f) {
+  if (output_scale != 1.0f) {
     args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales_t});
   }
   args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scales_t});
@@ -1697,7 +1697,7 @@ static at::Tensor _quantized_convolution_onednn(
     const ideep::scale_t accum_ideep_scale = ideep::scale_t(1, 1.0/accum_scale);
     const ideep::zero_point_t accum_ideep_zero_points = ideep::zero_point_t(1, accum_zero_point);
     // Set the dst scale and zero point with the value of accum.
-    // The true scale and zero point is stored in ideep::scale_t(scale_size, inv_output_scale) and dst_zero_points.
+    // The true scale and zero point are stored in ideep::scale_t(scale_size, output_scale) and dst_zero_points.
     dst.set_scale(accum_ideep_scale);
     dst.set_zero_point(accum_ideep_zero_points);
   }
@@ -1707,7 +1707,7 @@ static at::Tensor _quantized_convolution_onednn(
   ideep::convolution_forward::prepare(
       params, src, packed_weight, expected_bias, dst_dims, dst,
       stride.vec(), dilation.vec(), padding.vec(), padding.vec(), groups,
-      src_scales, weights_scales, ideep::scale_t(1, 1.0f / inv_output_scale),
+      src_scales, weights_scales, ideep::scale_t(1, 1.0f / output_scale),
       src_zero_points, dst_zero_points,
       op_attr, dnnl::algorithm::convolution_direct,
       dnnl::prop_kind::forward_inference,
@@ -1872,7 +1872,7 @@ class QConvoneDNN final {
       torch::List<int64_t> padding,
       torch::List<int64_t> dilation,
       int64_t groups,
-      double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant
+      double output_scale,
       int64_t output_zero_point,
       c10::optional<c10::ScalarType> output_dtype,
      c10::string_view attr,
@@ -1900,7 +1900,7 @@
         act, act_scale, act_zero_point,
         weight, weight_scales, weight_zero_points,
         bias, stride, padding, dilation, /*transposed*/false,
-        groups, inv_output_scale, output_zero_point,
+        groups, output_scale, output_zero_point,
         /*accum*/c10::nullopt, /*accum_scale*/0.0, /*accum_zero_point*/0,
         /*output_dtype*/output_dtype, /*binary_attr*/c10::nullopt, /*binary_alpha*/c10::nullopt,
         /*unary_attr*/attr, /*unary_scalars*/scalars, /*unary_algorithm*/algorithm
@@ -1924,7 +1924,7 @@
       torch::List<int64_t> padding,
       torch::List<int64_t> dilation,
       int64_t groups,
-      double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant
+      double output_scale,
       int64_t output_zero_point,
       c10::optional<c10::ScalarType> output_dtype,
       c10::string_view binary_attr,
@@ -1952,7 +1952,7 @@
         act, act_scale, act_zero_point,
         weight, weight_scales, weight_zero_points,
         bias, stride, padding, dilation, /*transposed*/false,
-        groups, inv_output_scale, output_zero_point,
+        groups, output_scale, output_zero_point,
         accum, accum_scale, accum_zero_point,
         /*output_dtype*/output_dtype, binary_attr, alpha,
         unary_attr, unary_scalars, unary_algorithm
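To make the scale plumbing above concrete, here is a minimal eager-mode reference of the arithmetic the oneDNN kernel implements (an editor's sketch assuming per-tensor activation and per-channel weight quantization; the names are illustrative, not the kernel's API):

    import torch
    import torch.nn.functional as F

    def qconv2d_reference(qx, x_scale, x_zero_point, qw, w_scales,
                          output_scale, output_zero_point):
        # Dequantize the activation (per tensor) and the weight (per output channel).
        x = (qx.to(torch.float32) - x_zero_point) * x_scale
        w = qw.to(torch.float32) * w_scales.view(-1, 1, 1, 1)
        y = F.conv2d(x, w)  # fp32 accumulation
        # Requantize: dividing by output_scale is the same multiplication by
        # 1.0f / output_scale that the prepare() call above hands to ideep.
        q = torch.round(y / output_scale) + output_zero_point
        return q.clamp_(0, 255).to(torch.uint8)

This mirrors only the default path; the conv-add (accum) branch additionally tracks accum_scale/accum_zero_point, as the @@ -1697 hunk shows.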
aten/src/ATen/native/quantized/library.cpp (8 changes: 4 additions & 4 deletions)

@@ -257,12 +257,12 @@ TORCH_LIBRARY(onednn, m) {
   m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv_prepack(Tensor weight, Tensor w_scales, float x_scale, int x_zp, int[] stride, int[] padding, int[] dilation, int groups, int[]? x_shape=None) -> Tensor"));

   // Conv1D/2D/3D with unary postop
-  m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
-  m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
-  m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor"));

   // Conv2D with binary postop
-  m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor"));
+  m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor"));

   // Linear prepack
   m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_prepack(Tensor weight, int[]? x_shape) -> Tensor"));
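A quick way to confirm the rename landed in the registered schema (assumes a PyTorch build with these onednn ops compiled in):

    import torch

    # The 12th positional argument of the printed schema should now read
    # "float output_scale" rather than "float inv_output_scale".
    print(torch.ops.onednn.qconv2d_pointwise.default._schema)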
@@ -136,11 +136,10 @@
     ("aten::batch_norm_backward_elemt", datetime.date(2023, 12, 31)),
     ("aten::sym_constrain_range", datetime.date(2023, 12, 31)),
     ("aten::_efficient_attention_forward", datetime.date(2024, 1, 15)),
-    ("onednn::qconv1d_pointwise", datetime.date(2023, 12, 31)),
-    ("onednn::qconv2d_pointwise", datetime.date(2023, 12, 31)),
-    ("onednn::qconv3d_pointwise", datetime.date(2023, 12, 31)),
-    ("onednn::qconv2d_pointwise.binary", datetime.date(2023, 12, 31)),
-    ("onednn::qlinear_pointwise", datetime.date(2023, 12, 31)),
+    ("onednn::qconv1d_pointwise", datetime.date(2024, 12, 31)),
+    ("onednn::qconv2d_pointwise", datetime.date(2024, 12, 31)),
+    ("onednn::qconv3d_pointwise", datetime.date(2024, 12, 31)),
+    ("onednn::qconv2d_pointwise.binary", datetime.date(2024, 12, 31)),
 ]

 ALLOW_LIST_COMPILED = [
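For context: entries in this allow list suppress the forward/backward-compatibility schema check for an operator until the given expiry date, which is why the dates move to the end of 2024 alongside the signature change. An illustrative sketch of the gating logic (the checker's real helper may differ):

    import datetime
    from typing import Optional

    ALLOW_LIST = [
        ("onednn::qconv2d_pointwise", datetime.date(2024, 12, 31)),
    ]

    def is_allow_listed(op_name: str, today: Optional[datetime.date] = None) -> bool:
        """Schema changes for op_name are tolerated until the entry expires."""
        today = today or datetime.date.today()
        return any(name == op_name and today < expiry for name, expiry in ALLOW_LIST)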
torch/_inductor/fx_passes/quantization.py (4 changes: 2 additions & 2 deletions)

@@ -174,7 +174,7 @@ def get_dequantize_qconv_pt2e_pattern(users=1):
         KeywordArg("padding"),
         KeywordArg("dilation"),
         KeywordArg("groups"),
-        KeywordArg("inv_output_scale"),  # inv_output_scale = 1.0
+        KeywordArg("output_scale"),  # output_scale = 1.0
         KeywordArg("output_zero_point"),  # output_zero_point = 0
         KeywordArg("output_dtype"),  # output_dtype = None
         KeywordArg("attr"),  # attr = "none"
@@ -1526,7 +1526,7 @@ def qconv_weight_prepack(match: Match, *args, **kwargs):
         padding,
         dilation,
         groups,
-        1.0,  # inv_output_scale
+        1.0,  # output_scale
         0,  # output_zero_point
         dtype,  # output_dtype
         "none",  # attr
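For orientation, each KeywordArg name above becomes a key in the match's kwargs, so downstream handlers now read output_scale instead of inv_output_scale. A hypothetical handler sketch (assuming the torch/_inductor/pattern_matcher.py API; this exact function is illustrative, not from the PR):

    from torch._inductor.pattern_matcher import Match

    def _handle_dequant_qconv(match: Match) -> None:
        # After this PR the captured kwarg is "output_scale" (was "inv_output_scale").
        output_scale = match.kwargs["output_scale"]
        output_zero_point = match.kwargs["output_zero_point"]
        # At weight-prepack matching time these are still the neutral values,
        # per the comments in the pattern above.
        assert output_scale == 1.0 and output_zero_point == 0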
torch/_inductor/ir.py (6 changes: 3 additions & 3 deletions)

@@ -6657,7 +6657,7 @@ def __init__(
         torch::List<int64_t> padding,
         torch::List<int64_t> dilation,
         int64_t groups,
-        double inv_output_scale,
+        double output_scale,
         int64_t output_zero_point,
         c10::optional<c10::ScalarType> output_dtype,
         c10::string_view attr,
@@ -6833,7 +6833,7 @@ def __init__(
         torch::List<int64_t> padding,
         torch::List<int64_t> dilation,
         int64_t groups,
-        double inv_output_scale,
+        double output_scale,
         int64_t output_zero_point,
         c10::optional<c10::ScalarType> output_dtype,
         c10::string_view binary_attr,
@@ -7049,7 +7049,7 @@ def __init__(
         at::Tensor weight_scales,
         at::Tensor weight_zero_points,
         c10::optional<at::Tensor> bias,
-        double inv_output_scale,
+        double output_scale,
         int64_t output_zero_point,
         c10::optional<c10::ScalarType> output_dtype,
         c10::string_view post_op_name,