cranelift/x64: Fix XmmRmREvex pretty-printing (#8508)

* cranelift/x64: Fix XmmRmREvex pretty-printing The operand collector had these operands in src1/src2/dst order, but the pretty-printer fetched the allocations in dst/src1/src2 order instead. Although our pretty-printer looked like it was printing src1/src2/dst, because it consumed operands in the wrong order, what it actually printed was src2/dst/src1. Meanwhile, Capstone actually uses src2/src1/dst order in AT&T mode. (GNU objdump agrees.) In the only filetest covering the vpsraq instruction, our output agreed with Capstone because register allocation picked the same register for both src1 and dst, so the two orders were indistinguishable. I've extended the filetest to force register allocation to pick different registers. This format is also used for vpmullq, but we didn't have any compile filetests covering that instruction, so I've added one with the same register allocation pattern. Now our pretty-printer agrees with Capstone on both instructions. * Fix emit-tests and vpermi2b This test for vpmullq had what we have now determined is the wrong order for src1 and src2. There were no emit-tests for vpsraq, so I added one. The vpermi2b tests used the wrong form of the Inst enum, judging by the assertions that are in x64_get_operands (which is not exercised by emit tests) and the fact that we never use that form for that instruction anywhere else. Pretty-printing vpermi2b disagreed with Capstone in the same way as vpsraq and vpmullq. I've fixed that form to agree with Capstone as well, aside from the duplicated src1/dst operand which are required to be different before register allocation and equal afterward.
bytecodealliance · May 1, 2024 · c66c874 · c66c874
1 parent d4e968c
commit c66c874
Show file tree

Hide file tree

Showing 5 changed files with 100 additions and 16 deletions.
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -61,6 +61,7 @@ impl Inst {
     }
 
     fn xmm_rm_r_evex(op: Avx512Opcode, src1: Reg, src2: RegMem, dst: Writable<Reg>) -> Self {
+        debug_assert_ne!(op, Avx512Opcode::Vpermi2b);
         src2.assert_regclass_is(RegClass::Float);
         debug_assert!(src1.class() == RegClass::Float);
         debug_assert!(dst.to_reg().class() == RegClass::Float);
@@ -72,6 +73,27 @@ impl Inst {
         }
     }
 
+    fn xmm_rm_r_evex3(
+        op: Avx512Opcode,
+        src1: Reg,
+        src2: Reg,
+        src3: RegMem,
+        dst: Writable<Reg>,
+    ) -> Self {
+        debug_assert_eq!(op, Avx512Opcode::Vpermi2b);
+        src3.assert_regclass_is(RegClass::Float);
+        debug_assert!(src1.class() == RegClass::Float);
+        debug_assert!(src2.class() == RegClass::Float);
+        debug_assert!(dst.to_reg().class() == RegClass::Float);
+        Inst::XmmRmREvex3 {
+            op,
+            src1: Xmm::new(src1).unwrap(),
+            src2: Xmm::new(src2).unwrap(),
+            src3: XmmMem::new(src3).unwrap(),
+            dst: WritableXmm::from_writable_reg(dst).unwrap(),
+        }
+    }
+
     // TODO Can be replaced by `Inst::move` (high-level) and `Inst::unary_rm_r` (low-level)
     fn xmm_mov(op: SseOpcode, src: RegMem, dst: Writable<Reg>) -> Inst {
         src.assert_regclass_is(RegClass::Float);
@@ -4189,19 +4211,37 @@ fn test_x64_emit() {
     insns.push((
         Inst::xmm_rm_r_evex(Avx512Opcode::Vpmullq, xmm10, RegMem::reg(xmm14), w_xmm1),
         "62D2AD0840CE",
-        "vpmullq %xmm10, %xmm14, %xmm1",
+        "vpmullq %xmm14, %xmm10, %xmm1",
     ));
 
     insns.push((
-        Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, xmm10, RegMem::reg(xmm14), w_xmm1),
+        Inst::xmm_rm_r_evex(Avx512Opcode::Vpsraq, xmm10, RegMem::reg(xmm14), w_xmm1),
+        "62D1AD08E2CE",
+        "vpsraq  %xmm14, %xmm10, %xmm1",
+    ));
+
+    insns.push((
+        Inst::xmm_rm_r_evex3(
+            Avx512Opcode::Vpermi2b,
+            xmm1,
+            xmm10,
+            RegMem::reg(xmm14),
+            w_xmm1,
+        ),
         "62D22D0875CE",
-        "vpermi2b %xmm10, %xmm14, %xmm1",
+        "vpermi2b %xmm14, %xmm10, %xmm1, %xmm1",
     ));
 
     insns.push((
-        Inst::xmm_rm_r_evex(Avx512Opcode::Vpermi2b, xmm0, RegMem::reg(xmm1), w_xmm2),
+        Inst::xmm_rm_r_evex3(
+            Avx512Opcode::Vpermi2b,
+            xmm2,
+            xmm0,
+            RegMem::reg(xmm1),
+            w_xmm2,
+        ),
         "62F27D0875D1",
-        "vpermi2b %xmm0, %xmm1, %xmm2",
+        "vpermi2b %xmm1, %xmm0, %xmm2, %xmm2",
     ));
 
     insns.push((

diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -1179,11 +1179,11 @@ impl PrettyPrint for Inst {
                 dst,
                 ..
             } => {
-                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
                 let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
                 let src2 = src2.pretty_print(8, allocs);
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
                 let op = ljustify(op.to_string());
-                format!("{op} {src1}, {src2}, {dst}")
+                format!("{op} {src2}, {src1}, {dst}")
             }
 
             Inst::XmmRmREvex3 {
@@ -1199,7 +1199,7 @@ impl PrettyPrint for Inst {
                 let src3 = src3.pretty_print(8, allocs);
                 let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
                 let op = ljustify(op.to_string());
-                format!("{op} {src1}, {src2}, {src3}, {dst}")
+                format!("{op} {src3}, {src2}, {src1}, {dst}")
             }
 
             Inst::XmmMinMaxSeq {

diff --git a/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif b/cranelift/filetests/filetests/isa/x64/shuffle-avx512.clif
@@ -15,7 +15,7 @@ block0(v0: i8x16, v1: i8x16):
 ;   movdqa  %xmm0, %xmm5
 ;   movdqu  const(0), %xmm0
 ;   movdqa  %xmm5, %xmm6
-;   vpermi2b %xmm0, %xmm6, %xmm1, %xmm0
+;   vpermi2b %xmm1, %xmm6, %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -54,7 +54,7 @@ block0(v0: i8x16, v1: i8x16):
 ;   movdqa  %xmm0, %xmm5
 ;   movdqu  const(0), %xmm0
 ;   movdqa  %xmm5, %xmm6
-;   vpermi2b %xmm0, %xmm6, %xmm1, %xmm0
+;   vpermi2b %xmm1, %xmm6, %xmm0, %xmm0
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret

diff --git a/cranelift/filetests/filetests/isa/x64/simd-i64x2-mul-avx512.clif b/cranelift/filetests/filetests/isa/x64/simd-i64x2-mul-avx512.clif
@@ -0,0 +1,33 @@
+test compile precise-output
+target x86_64 sse42 has_avx has_avx2 has_avx512dq has_avx512vl
+
+function %imul(i64x2, i64x2) -> i64x2, i64x2 {
+block0(v0: i64x2, v1: i64x2):
+  ;; Force register allocation to pick a different destination than
+  ;; source for at least one of these instructions.
+  v2 = imul v0, v1
+  v3 = imul v2, v1
+  return v2, v3
+}
+
+; VCode:
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   vpmullq %xmm1, %xmm0, %xmm0
+;   vpmullq %xmm1, %xmm0, %xmm1
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   pushq %rbp
+;   movq %rsp, %rbp
+; block1: ; offset 0x4
+;   vpmullq %xmm1, %xmm0, %xmm0
+;   vpmullq %xmm1, %xmm0, %xmm1
+;   movq %rbp, %rsp
+;   popq %rbp
+;   retq
+
diff --git a/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif b/cranelift/filetests/filetests/isa/x64/simd-i64x2-shift-avx512.clif
@@ -1,19 +1,26 @@
 test compile precise-output
 target x86_64 sse42 has_avx has_avx2 has_avx512f has_avx512vl
 
-function %sshr(i64x2, i64) -> i64x2 {
+function %sshr(i64x2, i64) -> i64x2, i64x2 {
 block0(v0: i64x2, v1: i64):
+  ;; Force register allocation to pick a different destination than
+  ;; source for at least one of these instructions.
   v2 = sshr v0, v1
-  return v2
+  v3 = sshr v2, v1
+  return v2, v3
 }
 
 ; VCode:
 ;   pushq   %rbp
 ;   movq    %rsp, %rbp
 ; block0:
+;   movq    %rdi, %r9
+;   andq    %r9, $63, %r9
+;   vmovd   %r9d, %xmm1
+;   vpsraq  %xmm1, %xmm0, %xmm0
 ;   andq    %rdi, $63, %rdi
-;   vmovd   %edi, %xmm5
-;   vpsraq  %xmm5, %xmm0, %xmm0
+;   vmovd   %edi, %xmm1
+;   vpsraq  %xmm1, %xmm0, %xmm1
 ;   movq    %rbp, %rsp
 ;   popq    %rbp
 ;   ret
@@ -23,9 +30,13 @@ block0(v0: i64x2, v1: i64):
 ;   pushq %rbp
 ;   movq %rsp, %rbp
 ; block1: ; offset 0x4
+;   movq %rdi, %r9
+;   andq $0x3f, %r9
+;   vmovd %r9d, %xmm1
+;   vpsraq %xmm1, %xmm0, %xmm0
 ;   andq $0x3f, %rdi
-;   vmovd %edi, %xmm5
-;   vpsraq %xmm5, %xmm0, %xmm0
+;   vmovd %edi, %xmm1
+;   vpsraq %xmm1, %xmm0, %xmm1
 ;   movq %rbp, %rsp
 ;   popq %rbp
 ;   retq