Skip to content

Commit

Permalink
cranelift/x64: Fix XmmRmREvex pretty-printing (#8508)
Browse files Browse the repository at this point in the history
* cranelift/x64: Fix XmmRmREvex pretty-printing

The operand collector had these operands in src1/src2/dst order, but the
pretty-printer fetched the allocations in dst/src1/src2 order instead.

Although our pretty-printer looked like it was printing src1/src2/dst,
because it consumed operands in the wrong order, what it actually
printed was src2/dst/src1.

Meanwhile, Capstone actually uses src2/src1/dst order in AT&T mode. (GNU
objdump agrees.)

In the only filetest covering the vpsraq instruction, our output agreed
with Capstone because register allocation picked the same register for
both src1 and dst, so the two orders were indistinguishable. I've
extended the filetest to force register allocation to pick different
registers.

This format is also used for vpmullq, but we didn't have any compile
filetests covering that instruction, so I've added one with the same
register allocation pattern.

Now our pretty-printer agrees with Capstone on both instructions.

* Fix emit-tests and vpermi2b

This test for vpmullq had what we have now determined is the wrong order
for src1 and src2.

There were no emit-tests for vpsraq, so I added one.

The vpermi2b tests used the wrong form of the Inst enum, judging by the
assertions that are in x64_get_operands (which is not exercised by emit
tests) and the fact that we never use that form for that instruction
anywhere else.

Pretty-printing vpermi2b disagreed with Capstone in the same way as
vpsraq and vpmullq. I've fixed that form to agree with Capstone as well,
aside from the duplicated src1/dst operand which are required to be
different before register allocation and equal afterward.
  • Loading branch information
jameysharp committed May 1, 2024
1 parent d4e968c commit c66c874
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 16 deletions.
50 changes: 45 additions & 5 deletions cranelift/codegen/src/isa/x64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ impl Inst {
}

fn xmm_rm_r_evex(op: Avx512Opcode, src1: Reg, src2: RegMem, dst: Writable<Reg>) -> Self {
debug_assert_ne!(op, Avx512Opcode::Vpermi2b);
src2.assert_regclass_is(RegClass::Float);
debug_assert!(src1.class() == RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Expand All @@ -72,6 +73,27 @@ impl Inst {
}
}

fn xmm_rm_r_evex3(
op: Avx512Opcode,
src1: Reg,
src2: Reg,
src3: RegMem,
dst: Writable<Reg>,
) -> Self {
debug_assert_eq!(op, Avx512Opcode::Vpermi2b);
src3.assert_regclass_is(RegClass::Float);
debug_assert!(src1.class() == RegClass::Float);
debug_assert!(src2.class() == RegClass::Float);
debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmRmREvex3 {
op,
src1: Xmm::new(src1).unwrap(),
src2: Xmm::new(src2).unwrap(),
src3: XmmMem::new(src3).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(),
}
}

// TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
src.assert_regclass_is(RegClass::Float);
Expand Down Expand Up @@ -4189,19 +4211,37 @@ fn test_x64_emit() {
insns.push((
Inst::xmm_rm_r_evex(Avx512Opcode::Vpmullq, xmm10, RegMem::reg(xmm14), w_xmm1),
"62D2AD0840CE",
"vpmullq %xmm10, %xmm14, %xmm1",
"vpmullq %xmm14, %xmm10, %xmm1",
));

insns.push((
Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, xmm10, RegMem::reg(xmm14), w_xmm1),
Inst::xmm_rm_r_evex(Avx512Opcode::Vpsraq, xmm10, RegMem::reg(xmm14), w_xmm1),
"62D1AD08E2CE",
"vpsraq %xmm14, %xmm10, %xmm1",
));

insns.push((
Inst::xmm_rm_r_evex3(
Avx512Opcode::Vpermi2b,
xmm1,
xmm10,
RegMem::reg(xmm14),
w_xmm1,
),
"62D22D0875CE",
"vpermi2b %xmm10, %xmm14, %xmm1",
"vpermi2b %xmm14, %xmm10, %xmm1, %xmm1",
));

insns.push((
Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, xmm0, RegMem::reg(xmm1), w_xmm2),
Inst::xmm_rm_r_evex3(
Avx512Opcode::Vpermi2b,
xmm2,
xmm0,
RegMem::reg(xmm1),
w_xmm2,
),
"62F27D0875D1",
"vpermi2b %xmm0, %xmm1, %xmm2",
"vpermi2b %xmm1, %xmm0, %xmm2, %xmm2",
));

insns.push((
Expand Down
6 changes: 3 additions & 3 deletions cranelift/codegen/src/isa/x64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1179,11 +1179,11 @@ impl PrettyPrint for Inst {
dst,
..
} => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
let src2 = src2.pretty_print(8, allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
let op = ljustify(op.to_string());
format!("{op} {src1}, {src2}, {dst}")
format!("{op} {src2}, {src1}, {dst}")
}

Inst::XmmRmREvex3 {
Expand All @@ -1199,7 +1199,7 @@ impl PrettyPrint for Inst {
let src3 = src3.pretty_print(8, allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
let op = ljustify(op.to_string());
format!("{op} {src1}, {src2}, {src3}, {dst}")
format!("{op} {src3}, {src2}, {src1}, {dst}")
}

Inst::XmmMinMaxSeq {
Expand Down
4 changes: 2 additions & 2 deletions cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ block0(v0: i8x16, v1: i8x16):
; movdqa %xmm0, %xmm5
; movdqu const(0), %xmm0
; movdqa %xmm5, %xmm6
; vpermi2b %xmm0, %xmm6, %xmm1, %xmm0
; vpermi2b %xmm1, %xmm6, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
Expand Down Expand Up @@ -54,7 +54,7 @@ block0(v0: i8x16, v1: i8x16):
; movdqa %xmm0, %xmm5
; movdqu const(0), %xmm0
; movdqa %xmm5, %xmm6
; vpermi2b %xmm0, %xmm6, %xmm1, %xmm0
; vpermi2b %xmm1, %xmm6, %xmm0, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
Expand Down
33 changes: 33 additions & 0 deletions cranelift/filetests/filetests/isa/x64/simd-i64x2-mul-avx512.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
test compile precise-output
target x86_64 sse42 has_avx has_avx2 has_avx512dq has_avx512vl

function %imul(i64x2, i64x2) -> i64x2, i64x2 {
block0(v0: i64x2, v1: i64x2):
;; Force register allocation to pick a different destination than
;; source for at least one of these instructions.
v2 = imul v0, v1
v3 = imul v2, v1
return v2, v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpmullq %xmm1, %xmm0, %xmm0
; vpmullq %xmm1, %xmm0, %xmm1
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpmullq %xmm1, %xmm0, %xmm0
; vpmullq %xmm1, %xmm0, %xmm1
; movq %rbp, %rsp
; popq %rbp
; retq

23 changes: 17 additions & 6 deletions cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
test compile precise-output
target x86_64 sse42 has_avx has_avx2 has_avx512f has_avx512vl

function %sshr(i64x2, i64) -> i64x2 {
function %sshr(i64x2, i64) -> i64x2, i64x2 {
block0(v0: i64x2, v1: i64):
;; Force register allocation to pick a different destination than
;; source for at least one of these instructions.
v2 = sshr v0, v1
return v2
v3 = sshr v2, v1
return v2, v3
}

; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movq %rdi, %r9
; andq %r9, $63, %r9
; vmovd %r9d, %xmm1
; vpsraq %xmm1, %xmm0, %xmm0
; andq %rdi, $63, %rdi
; vmovd %edi, %xmm5
; vpsraq %xmm5, %xmm0, %xmm0
; vmovd %edi, %xmm1
; vpsraq %xmm1, %xmm0, %xmm1
; movq %rbp, %rsp
; popq %rbp
; ret
Expand All @@ -23,9 +30,13 @@ block0(v0: i64x2, v1: i64):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movq %rdi, %r9
; andq $0x3f, %r9
; vmovd %r9d, %xmm1
; vpsraq %xmm1, %xmm0, %xmm0
; andq $0x3f, %rdi
; vmovd %edi, %xmm5
; vpsraq %xmm5, %xmm0, %xmm0
; vmovd %edi, %xmm1
; vpsraq %xmm1, %xmm0, %xmm1
; movq %rbp, %rsp
; popq %rbp
; retq
Expand Down

0 comments on commit c66c874

Please sign in to comment.