diff --git a/lib/compiler-singlepass/Cargo.toml b/lib/compiler-singlepass/Cargo.toml index d512297620c..ac2d36e7813 100644 --- a/lib/compiler-singlepass/Cargo.toml +++ b/lib/compiler-singlepass/Cargo.toml @@ -32,6 +32,8 @@ target-lexicon = { version = "0.12.2", default-features = false } maintenance = { status = "actively-developed" } [features] -default = ["std", "rayon"] +default = ["std", "rayon", "avx"] std = ["wasmer-compiler/std", "wasmer-types/std"] core = ["hashbrown", "wasmer-types/core"] +sse = [] +avx = [] diff --git a/lib/compiler-singlepass/src/compiler.rs b/lib/compiler-singlepass/src/compiler.rs index 0e0b08f8cb4..ed53ef90292 100644 --- a/lib/compiler-singlepass/src/compiler.rs +++ b/lib/compiler-singlepass/src/compiler.rs @@ -69,13 +69,16 @@ impl Compiler for SinglepassCompiler { )) } } - if target.triple().architecture == Architecture::X86_64 - && !target.cpu_features().contains(CpuFeature::AVX) - { + + let simd_arch = if target.cpu_features().contains(CpuFeature::AVX) { + CpuFeature::AVX + } else if target.cpu_features().contains(CpuFeature::SSE42) { + CpuFeature::SSE42 + } else { return Err(CompileError::UnsupportedTarget( - "x86_64 without AVX".to_string(), + "x86_64 without AVX or SSE 4.2".to_string(), )); - } + }; if compile_info.features.multi_value { return Err(CompileError::UnsupportedFeature("multivalue".to_string())); } @@ -131,7 +134,7 @@ impl Compiler for SinglepassCompiler { match target.triple().architecture { Architecture::X86_64 => { - let machine = MachineX86_64::new(); + let machine = MachineX86_64::new(Some(simd_arch)); let mut generator = FuncGen::new( module, &self.config, diff --git a/lib/compiler-singlepass/src/emitter_x64.rs b/lib/compiler-singlepass/src/emitter_x64.rs index 71f1b9128ba..7c773007753 100644 --- a/lib/compiler-singlepass/src/emitter_x64.rs +++ b/lib/compiler-singlepass/src/emitter_x64.rs @@ -2,13 +2,11 @@ use crate::common_decl::Size; use crate::location::Location as AbstractLocation; pub use crate::location::Multiplier; pub use crate::machine::{Label, Offset}; +use crate::machine_x64::AssemblerX64; pub use crate::x64_decl::{GPR, XMM}; use dynasm::dynasm; -use dynasmrt::{ - x64::X64Relocation, AssemblyOffset, DynamicLabel, DynasmApi, DynasmLabelApi, VecAssembler, -}; - -type Assembler = VecAssembler; +use dynasmrt::{AssemblyOffset, DynamicLabel, DynasmApi, DynasmLabelApi}; +use wasmer_compiler::CpuFeature; /// Force `dynasm!` to use the correct arch (x64) when cross-compiling. /// `dynasm!` proc-macro tries to auto-detect it by default by looking at the @@ -17,7 +15,7 @@ type Assembler = VecAssembler; macro_rules! dynasm { ($a:expr ; $($tt:tt)*) => { dynasm::dynasm!( - $a + $a.inner ; .arch x64 ; $($tt)* ) @@ -57,7 +55,13 @@ pub enum GPROrMemory { Memory(GPR, i32), } +pub enum Precision { + Single, + Double, +} + pub trait EmitterX64 { + fn get_simd_arch(&self) -> Option<&CpuFeature>; fn get_label(&mut self) -> Label; fn get_offset(&self) -> Offset; fn get_jmp_instr_size(&self) -> u8; @@ -471,154 +475,289 @@ macro_rules! jmp_op { } } +/// Move a single or double precision XMM value to another if src and destination +/// are not the same. +/// +/// TODO: Can we assume data is aligned and packed? 
If so, this function isn't necessary +/// TODO: as we can use [`EmitterX64::emit_vmovaps`] and [`EmitterX64::emit_vmovadp`] +/// TODO: instead +fn move_src_to_dst(emitter: &mut AssemblerX64, precision: Precision, src: XMM, dst: XMM) { + if src == dst { + return; + } + match precision { + Precision::Single => match src { + XMM::XMM0 => dynasm!(emitter ; movss Rx((dst as u8)), xmm0), + XMM::XMM1 => dynasm!(emitter ; movss Rx((dst as u8)), xmm1), + XMM::XMM2 => dynasm!(emitter ; movss Rx((dst as u8)), xmm2), + XMM::XMM3 => dynasm!(emitter ; movss Rx((dst as u8)), xmm3), + XMM::XMM4 => dynasm!(emitter ; movss Rx((dst as u8)), xmm4), + XMM::XMM5 => dynasm!(emitter ; movss Rx((dst as u8)), xmm5), + XMM::XMM6 => dynasm!(emitter ; movss Rx((dst as u8)), xmm6), + XMM::XMM7 => dynasm!(emitter ; movss Rx((dst as u8)), xmm7), + XMM::XMM8 => dynasm!(emitter ; movss Rx((dst as u8)), xmm8), + XMM::XMM9 => dynasm!(emitter ; movss Rx((dst as u8)), xmm9), + XMM::XMM10 => dynasm!(emitter ; movss Rx((dst as u8)), xmm10), + XMM::XMM11 => dynasm!(emitter ; movss Rx((dst as u8)), xmm11), + XMM::XMM12 => dynasm!(emitter ; movss Rx((dst as u8)), xmm12), + XMM::XMM13 => dynasm!(emitter ; movss Rx((dst as u8)), xmm13), + XMM::XMM14 => dynasm!(emitter ; movss Rx((dst as u8)), xmm14), + XMM::XMM15 => dynasm!(emitter ; movss Rx((dst as u8)), xmm15), + }, + Precision::Double => match src { + XMM::XMM0 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm0), + XMM::XMM1 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm1), + XMM::XMM2 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm2), + XMM::XMM3 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm3), + XMM::XMM4 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm4), + XMM::XMM5 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm5), + XMM::XMM6 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm6), + XMM::XMM7 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm7), + XMM::XMM8 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm8), + XMM::XMM9 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm9), + XMM::XMM10 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm10), + XMM::XMM11 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm11), + XMM::XMM12 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm12), + XMM::XMM13 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm13), + XMM::XMM14 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm14), + XMM::XMM15 => dynasm!(emitter ; movsd Rx((dst as u8)), xmm15), + }, + } +} + macro_rules! avx_fn { - ($ins:ident, $name:ident) => { - fn $name(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + ($ins:ident) => { + |emitter: &mut AssemblerX64, src1: XMM, src2: XMMOrMemory, dst: XMM| { // Dynasm bug: AVX instructions are not encoded correctly. 
match src2 { XMMOrMemory::XMM(x) => match src1 { - XMM::XMM0 => dynasm!(self ; $ins Rx((dst as u8)), xmm0, Rx((x as u8))), - XMM::XMM1 => dynasm!(self ; $ins Rx((dst as u8)), xmm1, Rx((x as u8))), - XMM::XMM2 => dynasm!(self ; $ins Rx((dst as u8)), xmm2, Rx((x as u8))), - XMM::XMM3 => dynasm!(self ; $ins Rx((dst as u8)), xmm3, Rx((x as u8))), - XMM::XMM4 => dynasm!(self ; $ins Rx((dst as u8)), xmm4, Rx((x as u8))), - XMM::XMM5 => dynasm!(self ; $ins Rx((dst as u8)), xmm5, Rx((x as u8))), - XMM::XMM6 => dynasm!(self ; $ins Rx((dst as u8)), xmm6, Rx((x as u8))), - XMM::XMM7 => dynasm!(self ; $ins Rx((dst as u8)), xmm7, Rx((x as u8))), - XMM::XMM8 => dynasm!(self ; $ins Rx((dst as u8)), xmm8, Rx((x as u8))), - XMM::XMM9 => dynasm!(self ; $ins Rx((dst as u8)), xmm9, Rx((x as u8))), - XMM::XMM10 => dynasm!(self ; $ins Rx((dst as u8)), xmm10, Rx((x as u8))), - XMM::XMM11 => dynasm!(self ; $ins Rx((dst as u8)), xmm11, Rx((x as u8))), - XMM::XMM12 => dynasm!(self ; $ins Rx((dst as u8)), xmm12, Rx((x as u8))), - XMM::XMM13 => dynasm!(self ; $ins Rx((dst as u8)), xmm13, Rx((x as u8))), - XMM::XMM14 => dynasm!(self ; $ins Rx((dst as u8)), xmm14, Rx((x as u8))), - XMM::XMM15 => dynasm!(self ; $ins Rx((dst as u8)), xmm15, Rx((x as u8))), + XMM::XMM0 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm0, Rx((x as u8))), + XMM::XMM1 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm1, Rx((x as u8))), + XMM::XMM2 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm2, Rx((x as u8))), + XMM::XMM3 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm3, Rx((x as u8))), + XMM::XMM4 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm4, Rx((x as u8))), + XMM::XMM5 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm5, Rx((x as u8))), + XMM::XMM6 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm6, Rx((x as u8))), + XMM::XMM7 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm7, Rx((x as u8))), + XMM::XMM8 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm8, Rx((x as u8))), + XMM::XMM9 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm9, Rx((x as u8))), + XMM::XMM10 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm10, Rx((x as u8))), + XMM::XMM11 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm11, Rx((x as u8))), + XMM::XMM12 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm12, Rx((x as u8))), + XMM::XMM13 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm13, Rx((x as u8))), + XMM::XMM14 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm14, Rx((x as u8))), + XMM::XMM15 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm15, Rx((x as u8))), }, XMMOrMemory::Memory(base, disp) => match src1 { - XMM::XMM0 => dynasm!(self ; $ins Rx((dst as u8)), xmm0, [Rq((base as u8)) + disp]), - XMM::XMM1 => dynasm!(self ; $ins Rx((dst as u8)), xmm1, [Rq((base as u8)) + disp]), - XMM::XMM2 => dynasm!(self ; $ins Rx((dst as u8)), xmm2, [Rq((base as u8)) + disp]), - XMM::XMM3 => dynasm!(self ; $ins Rx((dst as u8)), xmm3, [Rq((base as u8)) + disp]), - XMM::XMM4 => dynasm!(self ; $ins Rx((dst as u8)), xmm4, [Rq((base as u8)) + disp]), - XMM::XMM5 => dynasm!(self ; $ins Rx((dst as u8)), xmm5, [Rq((base as u8)) + disp]), - XMM::XMM6 => dynasm!(self ; $ins Rx((dst as u8)), xmm6, [Rq((base as u8)) + disp]), - XMM::XMM7 => dynasm!(self ; $ins Rx((dst as u8)), xmm7, [Rq((base as u8)) + disp]), - XMM::XMM8 => dynasm!(self ; $ins Rx((dst as u8)), xmm8, [Rq((base as u8)) + disp]), - XMM::XMM9 => dynasm!(self ; $ins Rx((dst as u8)), xmm9, [Rq((base as u8)) + disp]), - XMM::XMM10 => dynasm!(self ; $ins Rx((dst as u8)), xmm10, [Rq((base as u8)) + disp]), - XMM::XMM11 => dynasm!(self ; $ins Rx((dst as u8)), xmm11, 
[Rq((base as u8)) + disp]), - XMM::XMM12 => dynasm!(self ; $ins Rx((dst as u8)), xmm12, [Rq((base as u8)) + disp]), - XMM::XMM13 => dynasm!(self ; $ins Rx((dst as u8)), xmm13, [Rq((base as u8)) + disp]), - XMM::XMM14 => dynasm!(self ; $ins Rx((dst as u8)), xmm14, [Rq((base as u8)) + disp]), - XMM::XMM15 => dynasm!(self ; $ins Rx((dst as u8)), xmm15, [Rq((base as u8)) + disp]), + XMM::XMM0 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm0, [Rq((base as u8)) + disp]), + XMM::XMM1 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm1, [Rq((base as u8)) + disp]), + XMM::XMM2 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm2, [Rq((base as u8)) + disp]), + XMM::XMM3 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm3, [Rq((base as u8)) + disp]), + XMM::XMM4 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm4, [Rq((base as u8)) + disp]), + XMM::XMM5 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm5, [Rq((base as u8)) + disp]), + XMM::XMM6 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm6, [Rq((base as u8)) + disp]), + XMM::XMM7 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm7, [Rq((base as u8)) + disp]), + XMM::XMM8 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm8, [Rq((base as u8)) + disp]), + XMM::XMM9 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm9, [Rq((base as u8)) + disp]), + XMM::XMM10 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm10, [Rq((base as u8)) + disp]), + XMM::XMM11 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm11, [Rq((base as u8)) + disp]), + XMM::XMM12 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm12, [Rq((base as u8)) + disp]), + XMM::XMM13 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm13, [Rq((base as u8)) + disp]), + XMM::XMM14 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm14, [Rq((base as u8)) + disp]), + XMM::XMM15 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm15, [Rq((base as u8)) + disp]), }, } } } } +macro_rules! sse_fn { + ($ins:ident) => { + |emitter: &mut AssemblerX64, precision: Precision, src1: XMM, src2: XMMOrMemory, dst: XMM| { + match src2 { + XMMOrMemory::XMM(x) => { + assert_ne!(x, dst); + move_src_to_dst(emitter, precision, src1, dst); + dynasm!(emitter ; $ins Rx((dst as u8)), Rx((x as u8))) + } + XMMOrMemory::Memory(base, disp) => { + move_src_to_dst(emitter, precision, src1, dst); + dynasm!(emitter ; $ins Rx((dst as u8)), [Rq((base as u8)) + disp]) + } + } + } + }; + ($ins:ident, $mode:expr) => { + |emitter: &mut AssemblerX64, precision: Precision, src1: XMM, src2: XMMOrMemory, dst: XMM| { + match src2 { + XMMOrMemory::XMM(x) => { + move_src_to_dst(emitter, precision, src1, dst); + dynasm!(emitter ; $ins Rx((dst as u8)), Rx((x as u8)), $mode) + } + XMMOrMemory::Memory(base, disp) => { + move_src_to_dst(emitter, precision, src1, dst); + dynasm!(emitter ; $ins Rx((dst as u8)), [Rq((base as u8)) + disp], $mode) + } + } + } + }; +} + macro_rules! 
avx_i2f_64_fn { - ($ins:ident, $name:ident) => { - fn $name(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) { + ($ins:ident) => { + |emitter: &mut AssemblerX64, src1: XMM, src2: GPROrMemory, dst: XMM| { match src2 { GPROrMemory::GPR(x) => match src1 { - XMM::XMM0 => dynasm!(self ; $ins Rx((dst as u8)), xmm0, Rq((x as u8))), - XMM::XMM1 => dynasm!(self ; $ins Rx((dst as u8)), xmm1, Rq((x as u8))), - XMM::XMM2 => dynasm!(self ; $ins Rx((dst as u8)), xmm2, Rq((x as u8))), - XMM::XMM3 => dynasm!(self ; $ins Rx((dst as u8)), xmm3, Rq((x as u8))), - XMM::XMM4 => dynasm!(self ; $ins Rx((dst as u8)), xmm4, Rq((x as u8))), - XMM::XMM5 => dynasm!(self ; $ins Rx((dst as u8)), xmm5, Rq((x as u8))), - XMM::XMM6 => dynasm!(self ; $ins Rx((dst as u8)), xmm6, Rq((x as u8))), - XMM::XMM7 => dynasm!(self ; $ins Rx((dst as u8)), xmm7, Rq((x as u8))), - XMM::XMM8 => dynasm!(self ; $ins Rx((dst as u8)), xmm8, Rq((x as u8))), - XMM::XMM9 => dynasm!(self ; $ins Rx((dst as u8)), xmm9, Rq((x as u8))), - XMM::XMM10 => dynasm!(self ; $ins Rx((dst as u8)), xmm10, Rq((x as u8))), - XMM::XMM11 => dynasm!(self ; $ins Rx((dst as u8)), xmm11, Rq((x as u8))), - XMM::XMM12 => dynasm!(self ; $ins Rx((dst as u8)), xmm12, Rq((x as u8))), - XMM::XMM13 => dynasm!(self ; $ins Rx((dst as u8)), xmm13, Rq((x as u8))), - XMM::XMM14 => dynasm!(self ; $ins Rx((dst as u8)), xmm14, Rq((x as u8))), - XMM::XMM15 => dynasm!(self ; $ins Rx((dst as u8)), xmm15, Rq((x as u8))), + XMM::XMM0 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm0, Rq((x as u8))), + XMM::XMM1 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm1, Rq((x as u8))), + XMM::XMM2 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm2, Rq((x as u8))), + XMM::XMM3 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm3, Rq((x as u8))), + XMM::XMM4 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm4, Rq((x as u8))), + XMM::XMM5 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm5, Rq((x as u8))), + XMM::XMM6 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm6, Rq((x as u8))), + XMM::XMM7 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm7, Rq((x as u8))), + XMM::XMM8 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm8, Rq((x as u8))), + XMM::XMM9 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm9, Rq((x as u8))), + XMM::XMM10 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm10, Rq((x as u8))), + XMM::XMM11 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm11, Rq((x as u8))), + XMM::XMM12 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm12, Rq((x as u8))), + XMM::XMM13 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm13, Rq((x as u8))), + XMM::XMM14 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm14, Rq((x as u8))), + XMM::XMM15 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm15, Rq((x as u8))), }, GPROrMemory::Memory(base, disp) => match src1 { - XMM::XMM0 => dynasm!(self ; $ins Rx((dst as u8)), xmm0, QWORD [Rq((base as u8)) + disp]), - XMM::XMM1 => dynasm!(self ; $ins Rx((dst as u8)), xmm1, QWORD [Rq((base as u8)) + disp]), - XMM::XMM2 => dynasm!(self ; $ins Rx((dst as u8)), xmm2, QWORD [Rq((base as u8)) + disp]), - XMM::XMM3 => dynasm!(self ; $ins Rx((dst as u8)), xmm3, QWORD [Rq((base as u8)) + disp]), - XMM::XMM4 => dynasm!(self ; $ins Rx((dst as u8)), xmm4, QWORD [Rq((base as u8)) + disp]), - XMM::XMM5 => dynasm!(self ; $ins Rx((dst as u8)), xmm5, QWORD [Rq((base as u8)) + disp]), - XMM::XMM6 => dynasm!(self ; $ins Rx((dst as u8)), xmm6, QWORD [Rq((base as u8)) + disp]), - XMM::XMM7 => dynasm!(self ; $ins Rx((dst as u8)), xmm7, QWORD [Rq((base as u8)) + disp]), - XMM::XMM8 => dynasm!(self ; $ins Rx((dst as u8)), xmm8, QWORD 
[Rq((base as u8)) + disp]), - XMM::XMM9 => dynasm!(self ; $ins Rx((dst as u8)), xmm9, QWORD [Rq((base as u8)) + disp]), - XMM::XMM10 => dynasm!(self ; $ins Rx((dst as u8)), xmm10, QWORD [Rq((base as u8)) + disp]), - XMM::XMM11 => dynasm!(self ; $ins Rx((dst as u8)), xmm11, QWORD [Rq((base as u8)) + disp]), - XMM::XMM12 => dynasm!(self ; $ins Rx((dst as u8)), xmm12, QWORD [Rq((base as u8)) + disp]), - XMM::XMM13 => dynasm!(self ; $ins Rx((dst as u8)), xmm13, QWORD [Rq((base as u8)) + disp]), - XMM::XMM14 => dynasm!(self ; $ins Rx((dst as u8)), xmm14, QWORD [Rq((base as u8)) + disp]), - XMM::XMM15 => dynasm!(self ; $ins Rx((dst as u8)), xmm15, QWORD [Rq((base as u8)) + disp]), + XMM::XMM0 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm0, QWORD [Rq((base as u8)) + disp]), + XMM::XMM1 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm1, QWORD [Rq((base as u8)) + disp]), + XMM::XMM2 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm2, QWORD [Rq((base as u8)) + disp]), + XMM::XMM3 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm3, QWORD [Rq((base as u8)) + disp]), + XMM::XMM4 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm4, QWORD [Rq((base as u8)) + disp]), + XMM::XMM5 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm5, QWORD [Rq((base as u8)) + disp]), + XMM::XMM6 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm6, QWORD [Rq((base as u8)) + disp]), + XMM::XMM7 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm7, QWORD [Rq((base as u8)) + disp]), + XMM::XMM8 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm8, QWORD [Rq((base as u8)) + disp]), + XMM::XMM9 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm9, QWORD [Rq((base as u8)) + disp]), + XMM::XMM10 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm10, QWORD [Rq((base as u8)) + disp]), + XMM::XMM11 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm11, QWORD [Rq((base as u8)) + disp]), + XMM::XMM12 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm12, QWORD [Rq((base as u8)) + disp]), + XMM::XMM13 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm13, QWORD [Rq((base as u8)) + disp]), + XMM::XMM14 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm14, QWORD [Rq((base as u8)) + disp]), + XMM::XMM15 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm15, QWORD [Rq((base as u8)) + disp]), }, } } } } +macro_rules! sse_i2f_64_fn { + ($ins:ident) => { + |emitter: &mut AssemblerX64, precision: Precision, src1: XMM, src2: GPROrMemory, dst: XMM| { + match src2 { + GPROrMemory::GPR(x) => { + move_src_to_dst(emitter, precision, src1, dst); + dynasm!(emitter ; $ins Rx((dst as u8)), Rq((x as u8))) + }, + GPROrMemory::Memory(base, disp) => { + move_src_to_dst(emitter, precision, src1, dst); + dynasm!(emitter ; $ins Rx((dst as u8)), QWORD [Rq((base as u8)) + disp]) + } + } + } + } +} + macro_rules! 
avx_i2f_32_fn { - ($ins:ident, $name:ident) => { - fn $name(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) { + ($ins:ident) => { + |emitter: &mut AssemblerX64, src1: XMM, src2: GPROrMemory, dst: XMM| { match src2 { GPROrMemory::GPR(x) => match src1 { - XMM::XMM0 => dynasm!(self ; $ins Rx((dst as u8)), xmm0, Rd((x as u8))), - XMM::XMM1 => dynasm!(self ; $ins Rx((dst as u8)), xmm1, Rd((x as u8))), - XMM::XMM2 => dynasm!(self ; $ins Rx((dst as u8)), xmm2, Rd((x as u8))), - XMM::XMM3 => dynasm!(self ; $ins Rx((dst as u8)), xmm3, Rd((x as u8))), - XMM::XMM4 => dynasm!(self ; $ins Rx((dst as u8)), xmm4, Rd((x as u8))), - XMM::XMM5 => dynasm!(self ; $ins Rx((dst as u8)), xmm5, Rd((x as u8))), - XMM::XMM6 => dynasm!(self ; $ins Rx((dst as u8)), xmm6, Rd((x as u8))), - XMM::XMM7 => dynasm!(self ; $ins Rx((dst as u8)), xmm7, Rd((x as u8))), - XMM::XMM8 => dynasm!(self ; $ins Rx((dst as u8)), xmm8, Rd((x as u8))), - XMM::XMM9 => dynasm!(self ; $ins Rx((dst as u8)), xmm9, Rd((x as u8))), - XMM::XMM10 => dynasm!(self ; $ins Rx((dst as u8)), xmm10, Rd((x as u8))), - XMM::XMM11 => dynasm!(self ; $ins Rx((dst as u8)), xmm11, Rd((x as u8))), - XMM::XMM12 => dynasm!(self ; $ins Rx((dst as u8)), xmm12, Rd((x as u8))), - XMM::XMM13 => dynasm!(self ; $ins Rx((dst as u8)), xmm13, Rd((x as u8))), - XMM::XMM14 => dynasm!(self ; $ins Rx((dst as u8)), xmm14, Rd((x as u8))), - XMM::XMM15 => dynasm!(self ; $ins Rx((dst as u8)), xmm15, Rd((x as u8))), + XMM::XMM0 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm0, Rd((x as u8))), + XMM::XMM1 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm1, Rd((x as u8))), + XMM::XMM2 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm2, Rd((x as u8))), + XMM::XMM3 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm3, Rd((x as u8))), + XMM::XMM4 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm4, Rd((x as u8))), + XMM::XMM5 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm5, Rd((x as u8))), + XMM::XMM6 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm6, Rd((x as u8))), + XMM::XMM7 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm7, Rd((x as u8))), + XMM::XMM8 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm8, Rd((x as u8))), + XMM::XMM9 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm9, Rd((x as u8))), + XMM::XMM10 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm10, Rd((x as u8))), + XMM::XMM11 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm11, Rd((x as u8))), + XMM::XMM12 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm12, Rd((x as u8))), + XMM::XMM13 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm13, Rd((x as u8))), + XMM::XMM14 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm14, Rd((x as u8))), + XMM::XMM15 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm15, Rd((x as u8))), }, GPROrMemory::Memory(base, disp) => match src1 { - XMM::XMM0 => dynasm!(self ; $ins Rx((dst as u8)), xmm0, DWORD [Rq((base as u8)) + disp]), - XMM::XMM1 => dynasm!(self ; $ins Rx((dst as u8)), xmm1, DWORD [Rq((base as u8)) + disp]), - XMM::XMM2 => dynasm!(self ; $ins Rx((dst as u8)), xmm2, DWORD [Rq((base as u8)) + disp]), - XMM::XMM3 => dynasm!(self ; $ins Rx((dst as u8)), xmm3, DWORD [Rq((base as u8)) + disp]), - XMM::XMM4 => dynasm!(self ; $ins Rx((dst as u8)), xmm4, DWORD [Rq((base as u8)) + disp]), - XMM::XMM5 => dynasm!(self ; $ins Rx((dst as u8)), xmm5, DWORD [Rq((base as u8)) + disp]), - XMM::XMM6 => dynasm!(self ; $ins Rx((dst as u8)), xmm6, DWORD [Rq((base as u8)) + disp]), - XMM::XMM7 => dynasm!(self ; $ins Rx((dst as u8)), xmm7, DWORD [Rq((base as u8)) + disp]), - XMM::XMM8 => dynasm!(self ; $ins Rx((dst as u8)), xmm8, DWORD 
[Rq((base as u8)) + disp]), - XMM::XMM9 => dynasm!(self ; $ins Rx((dst as u8)), xmm9, DWORD [Rq((base as u8)) + disp]), - XMM::XMM10 => dynasm!(self ; $ins Rx((dst as u8)), xmm10, DWORD [Rq((base as u8)) + disp]), - XMM::XMM11 => dynasm!(self ; $ins Rx((dst as u8)), xmm11, DWORD [Rq((base as u8)) + disp]), - XMM::XMM12 => dynasm!(self ; $ins Rx((dst as u8)), xmm12, DWORD [Rq((base as u8)) + disp]), - XMM::XMM13 => dynasm!(self ; $ins Rx((dst as u8)), xmm13, DWORD [Rq((base as u8)) + disp]), - XMM::XMM14 => dynasm!(self ; $ins Rx((dst as u8)), xmm14, DWORD [Rq((base as u8)) + disp]), - XMM::XMM15 => dynasm!(self ; $ins Rx((dst as u8)), xmm15, DWORD [Rq((base as u8)) + disp]), + XMM::XMM0 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm0, DWORD [Rq((base as u8)) + disp]), + XMM::XMM1 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm1, DWORD [Rq((base as u8)) + disp]), + XMM::XMM2 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm2, DWORD [Rq((base as u8)) + disp]), + XMM::XMM3 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm3, DWORD [Rq((base as u8)) + disp]), + XMM::XMM4 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm4, DWORD [Rq((base as u8)) + disp]), + XMM::XMM5 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm5, DWORD [Rq((base as u8)) + disp]), + XMM::XMM6 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm6, DWORD [Rq((base as u8)) + disp]), + XMM::XMM7 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm7, DWORD [Rq((base as u8)) + disp]), + XMM::XMM8 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm8, DWORD [Rq((base as u8)) + disp]), + XMM::XMM9 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm9, DWORD [Rq((base as u8)) + disp]), + XMM::XMM10 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm10, DWORD [Rq((base as u8)) + disp]), + XMM::XMM11 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm11, DWORD [Rq((base as u8)) + disp]), + XMM::XMM12 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm12, DWORD [Rq((base as u8)) + disp]), + XMM::XMM13 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm13, DWORD [Rq((base as u8)) + disp]), + XMM::XMM14 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm14, DWORD [Rq((base as u8)) + disp]), + XMM::XMM15 => dynasm!(emitter ; $ins Rx((dst as u8)), xmm15, DWORD [Rq((base as u8)) + disp]), + }, + } + } + } +} + +macro_rules! sse_i2f_32_fn { + ($ins:ident) => { + |emitter: &mut AssemblerX64, precision: Precision, src1: XMM, src2: GPROrMemory, dst: XMM| { + match src2 { + GPROrMemory::GPR(x) => { + move_src_to_dst(emitter, precision, src1, dst); + dynasm!(emitter; $ins Rx((src1 as u8)), Rd((x as u8))) }, + GPROrMemory::Memory(base, disp) => { + move_src_to_dst(emitter, precision, src1, dst); + dynasm!(emitter ; $ins Rx((dst as u8)), DWORD [Rq((base as u8)) + disp]) + } } } } } macro_rules! avx_round_fn { - ($ins:ident, $name:ident, $mode:expr) => { - fn $name(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + ($ins:ident, $mode:expr) => { + |emitter: &mut AssemblerX64, src1: XMM, src2: XMMOrMemory, dst: XMM| { match src2 { - XMMOrMemory::XMM(x) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), Rx((x as u8)), $mode), - XMMOrMemory::Memory(base, disp) => dynasm!(self ; $ins Rx((dst as u8)), Rx((src1 as u8)), [Rq((base as u8)) + disp], $mode), + XMMOrMemory::XMM(x) => dynasm!(emitter ; $ins Rx((dst as u8)), Rx((src1 as u8)), Rx((x as u8)), $mode), + XMMOrMemory::Memory(base, disp) => dynasm!(emitter ; $ins Rx((dst as u8)), Rx((src1 as u8)), [Rq((base as u8)) + disp], $mode), } } } } -impl EmitterX64 for Assembler { +macro_rules! 
sse_round_fn { + ($ins:ident, $mode:expr) => { + |emitter: &mut AssemblerX64, precision: Precision, src1: XMM, src2: XMMOrMemory, dst: XMM| { + match src2 { + XMMOrMemory::XMM(x) => { + assert_eq!(src1, x); + move_src_to_dst(emitter, precision, src1, dst); + dynasm!(emitter ; $ins Rx((dst as u8)), Rx((dst as u8)), $mode) + } + XMMOrMemory::Memory(..) => unreachable!(), + } + } + } +} + +impl EmitterX64 for AssemblerX64 { + fn get_simd_arch(&self) -> Option<&CpuFeature> { + self.simd_arch.as_ref() + } + fn get_label(&mut self) -> DynamicLabel { self.new_dynamic_label() } @@ -904,7 +1043,7 @@ impl EmitterX64 for Assembler { } } fn emit_cmp(&mut self, sz: Size, left: Location, right: Location) { - // Constant elimination for comparision between consts. + // Constant elimination for comparison between consts. // // Only needed for `emit_cmp`, since other binary operators actually write to `right` and `right` must // be a writable location for them. @@ -1254,91 +1393,402 @@ impl EmitterX64 for Assembler { _ => panic!("singlepass can't emit VMOVAPD {:?} {:?}", src, dst), }; } - - avx_fn!(vxorps, emit_vxorps); - avx_fn!(vxorpd, emit_vxorpd); - - avx_fn!(vaddss, emit_vaddss); - avx_fn!(vaddsd, emit_vaddsd); - - avx_fn!(vsubss, emit_vsubss); - avx_fn!(vsubsd, emit_vsubsd); - - avx_fn!(vmulss, emit_vmulss); - avx_fn!(vmulsd, emit_vmulsd); - - avx_fn!(vdivss, emit_vdivss); - avx_fn!(vdivsd, emit_vdivsd); - - avx_fn!(vmaxss, emit_vmaxss); - avx_fn!(vmaxsd, emit_vmaxsd); - - avx_fn!(vminss, emit_vminss); - avx_fn!(vminsd, emit_vminsd); - - avx_fn!(vcmpeqss, emit_vcmpeqss); - avx_fn!(vcmpeqsd, emit_vcmpeqsd); - - avx_fn!(vcmpneqss, emit_vcmpneqss); - avx_fn!(vcmpneqsd, emit_vcmpneqsd); - - avx_fn!(vcmpltss, emit_vcmpltss); - avx_fn!(vcmpltsd, emit_vcmpltsd); - - avx_fn!(vcmpless, emit_vcmpless); - avx_fn!(vcmplesd, emit_vcmplesd); - - avx_fn!(vcmpgtss, emit_vcmpgtss); - avx_fn!(vcmpgtsd, emit_vcmpgtsd); - - avx_fn!(vcmpgess, emit_vcmpgess); - avx_fn!(vcmpgesd, emit_vcmpgesd); - - avx_fn!(vcmpunordss, emit_vcmpunordss); - avx_fn!(vcmpunordsd, emit_vcmpunordsd); - - avx_fn!(vcmpordss, emit_vcmpordss); - avx_fn!(vcmpordsd, emit_vcmpordsd); - - avx_fn!(vsqrtss, emit_vsqrtss); - avx_fn!(vsqrtsd, emit_vsqrtsd); - - avx_fn!(vcvtss2sd, emit_vcvtss2sd); - avx_fn!(vcvtsd2ss, emit_vcvtsd2ss); - - avx_round_fn!(vroundss, emit_vroundss_nearest, 0); - avx_round_fn!(vroundss, emit_vroundss_floor, 1); - avx_round_fn!(vroundss, emit_vroundss_ceil, 2); - avx_round_fn!(vroundss, emit_vroundss_trunc, 3); - avx_round_fn!(vroundsd, emit_vroundsd_nearest, 0); - avx_round_fn!(vroundsd, emit_vroundsd_floor, 1); - avx_round_fn!(vroundsd, emit_vroundsd_ceil, 2); - avx_round_fn!(vroundsd, emit_vroundsd_trunc, 3); - - avx_i2f_32_fn!(vcvtsi2ss, emit_vcvtsi2ss_32); - avx_i2f_32_fn!(vcvtsi2sd, emit_vcvtsi2sd_32); - avx_i2f_64_fn!(vcvtsi2ss, emit_vcvtsi2ss_64); - avx_i2f_64_fn!(vcvtsi2sd, emit_vcvtsi2sd_64); - - fn emit_vblendvps(&mut self, src1: XMM, src2: XMMOrMemory, mask: XMM, dst: XMM) { - match src2 { - XMMOrMemory::XMM(src2) => { - dynasm!(self ; vblendvps Rx(dst as u8), Rx(mask as u8), Rx(src2 as u8), Rx(src1 as u8)) + fn emit_vxorps(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vxorps)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(xorps)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vxorpd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => 
avx_fn!(vxorpd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(xorpd)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vaddss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vaddss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(addss)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vaddsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vaddsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(addsd)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vsubss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vsubss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(subss)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vsubsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vsubsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(subsd)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vmulss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vmulss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(mulss)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vmulsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vmulsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(mulsd)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vdivss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vdivss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(divss)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vdivsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vdivsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(divsd)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vmaxss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vmaxss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(maxss)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vmaxsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vmaxsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(maxsd)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vminss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vminss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(minss)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vminsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vminsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(minsd)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpeqss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => 
avx_fn!(vcmpeqss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpss, 0)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpeqsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpeqsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpsd, 0)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpneqss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpneqss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpss, 4)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpneqsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpneqsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpsd, 4)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpltss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpltss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpss, 1)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpltsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpltsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpsd, 1)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpless(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpless)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpss, 2)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vcmplesd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmplesd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpsd, 2)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpgtss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpgtss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpss, 6)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpgtsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpgtsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpsd, 6)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpgess(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpgess)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpss, 5)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpgesd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpgesd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpsd, 5)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpunordss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpunordss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpss, 3)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpunordsd(&mut self, 
src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpunordsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpsd, 3)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpordss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpordss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpss, 7)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vcmpordsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcmpordsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cmpsd, 7)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vsqrtss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vsqrtss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(sqrtss)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vsqrtsd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vsqrtsd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(sqrtsd)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vcvtss2sd(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcvtss2sd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cvtss2sd)(self, Precision::Single, src1, src2, dst), + _ => {} + } + } + fn emit_vcvtsd2ss(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_fn!(vcvtsd2ss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => sse_fn!(cvtsd2ss)(self, Precision::Double, src1, src2, dst), + _ => {} + } + } + fn emit_vroundss_nearest(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_round_fn!(vroundss, 0)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_round_fn!(roundss, 0)(self, Precision::Single, src1, src2, dst) } - XMMOrMemory::Memory(base, disp) => { - dynasm!(self ; vblendvps Rx(dst as u8), Rx(mask as u8), [Rq(base as u8) + disp], Rx(src1 as u8)) + _ => {} + } + } + fn emit_vroundsd_nearest(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_round_fn!(vroundsd, 0)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_round_fn!(roundsd, 0)(self, Precision::Double, src1, src2, dst) } + _ => {} } } - - fn emit_vblendvpd(&mut self, src1: XMM, src2: XMMOrMemory, mask: XMM, dst: XMM) { - match src2 { - XMMOrMemory::XMM(src2) => { - dynasm!(self ; vblendvpd Rx(dst as u8), Rx(mask as u8), Rx(src2 as u8), Rx(src1 as u8)) + fn emit_vroundss_floor(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_round_fn!(vroundss, 1)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_round_fn!(roundss, 1)(self, Precision::Single, src1, src2, dst) } - XMMOrMemory::Memory(base, disp) => { - dynasm!(self ; vblendvpd Rx(dst as u8), Rx(mask as u8), [Rq(base as u8) + disp], Rx(src1 as u8)) + _ => {} + } + } + fn emit_vroundsd_floor(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_round_fn!(vroundsd, 1)(self, src1, src2, dst), + 
Some(CpuFeature::SSE42) => { + sse_round_fn!(roundsd, 1)(self, Precision::Double, src1, src2, dst) + } + _ => {} + } + } + fn emit_vroundss_ceil(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_round_fn!(vroundss, 2)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_round_fn!(roundss, 2)(self, Precision::Single, src1, src2, dst) + } + _ => {} + } + } + fn emit_vroundsd_ceil(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_round_fn!(vroundsd, 2)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_round_fn!(roundsd, 2)(self, Precision::Double, src1, src2, dst) + } + _ => {} + } + } + fn emit_vroundss_trunc(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_round_fn!(vroundss, 3)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_round_fn!(roundss, 3)(self, Precision::Single, src1, src2, dst) + } + _ => {} + } + } + fn emit_vroundsd_trunc(&mut self, src1: XMM, src2: XMMOrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_round_fn!(vroundsd, 3)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_round_fn!(roundsd, 3)(self, Precision::Double, src1, src2, dst) + } + _ => {} + } + } + fn emit_vcvtsi2ss_32(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_i2f_32_fn!(vcvtsi2ss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_i2f_32_fn!(cvtsi2ss)(self, Precision::Single, src1, src2, dst) + } + _ => {} + } + } + fn emit_vcvtsi2sd_32(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_i2f_32_fn!(vcvtsi2sd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_i2f_32_fn!(cvtsi2sd)(self, Precision::Double, src1, src2, dst) + } + _ => {} + } + } + fn emit_vcvtsi2ss_64(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_i2f_64_fn!(vcvtsi2ss)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_i2f_64_fn!(cvtsi2ss)(self, Precision::Single, src1, src2, dst) } + _ => {} + } + } + fn emit_vcvtsi2sd_64(&mut self, src1: XMM, src2: GPROrMemory, dst: XMM) { + match self.get_simd_arch() { + Some(CpuFeature::AVX) => avx_i2f_64_fn!(vcvtsi2sd)(self, src1, src2, dst), + Some(CpuFeature::SSE42) => { + sse_i2f_64_fn!(cvtsi2sd)(self, Precision::Double, src1, src2, dst) + } + _ => {} + } + } + + fn emit_vblendvps(&mut self, src1: XMM, src2: XMMOrMemory, mask: XMM, dst: XMM) { + // this implementation works only for sse 4.1 and greater + match self.get_simd_arch() { + Some(CpuFeature::AVX) => match src2 { + XMMOrMemory::XMM(src2) => { + // TODO: this argument order does not match the documentation?? 
+ dynasm!( self; vblendvps Rx(dst as u8), Rx(mask as u8), Rx(src2 as u8), Rx(src1 as u8)) + } + XMMOrMemory::Memory(base, disp) => { + dynasm!( self; vblendvps Rx(dst as u8), Rx(mask as u8), [Rq(base as u8) + disp], Rx(src1 as u8)) + } + }, + Some(CpuFeature::SSE42) => match src2 { + XMMOrMemory::XMM(src2) => { + move_src_to_dst(self, Precision::Single, src1, dst); + dynasm!( self; blendvps Rx(dst as u8), Rx(src2 as u8)) + } + XMMOrMemory::Memory(base, disp) => { + move_src_to_dst(self, Precision::Single, src1, dst); + dynasm!( self; blendvps Rx(dst as u8), [Rq(base as u8) + disp]) + } + }, + _ => {} + } + } + + fn emit_vblendvpd(&mut self, src1: XMM, src2: XMMOrMemory, mask: XMM, dst: XMM) { + // this implementation works only for sse 4.1 and greater + match self.get_simd_arch() { + Some(CpuFeature::AVX) => match src2 { + XMMOrMemory::XMM(src2) => { + // TODO: this argument order does not match the documentation?? + dynasm!( self; vblendvpd Rx(dst as u8), Rx(mask as u8), Rx(src2 as u8), Rx(src1 as u8)) + } + XMMOrMemory::Memory(base, disp) => { + dynasm!( self; vblendvpd Rx(dst as u8), Rx(mask as u8), [Rq(base as u8) + disp], Rx(src1 as u8)) + } + }, + Some(CpuFeature::SSE42) => match src2 { + XMMOrMemory::XMM(src2) => { + move_src_to_dst(self, Precision::Double, src1, dst); + dynasm!( self; blendvpd Rx(dst as u8), Rx(src2 as u8)) + } + XMMOrMemory::Memory(base, disp) => { + move_src_to_dst(self, Precision::Double, src1, dst); + dynasm!( self; blendvpd Rx(dst as u8), [Rq(base as u8) + disp]) + } + }, + _ => {} } } diff --git a/lib/compiler-singlepass/src/machine.rs b/lib/compiler-singlepass/src/machine.rs index 1ad5d04254e..d7359342e9f 100644 --- a/lib/compiler-singlepass/src/machine.rs +++ b/lib/compiler-singlepass/src/machine.rs @@ -8,8 +8,8 @@ use std::fmt::Debug; pub use wasmer_compiler::wasmparser::MemoryImmediate; use wasmer_compiler::wasmparser::Type as WpType; use wasmer_compiler::{ - Architecture, CallingConvention, CustomSection, FunctionBody, InstructionAddressMap, - Relocation, RelocationTarget, Target, TrapInformation, + Architecture, CallingConvention, CpuFeature, CustomSection, FunctionBody, + InstructionAddressMap, Relocation, RelocationTarget, Target, TrapInformation, }; use wasmer_types::{FunctionIndex, FunctionType}; use wasmer_vm::{TrapCode, VMOffsets}; @@ -2200,7 +2200,13 @@ pub fn gen_std_trampoline( ) -> FunctionBody { match target.triple().architecture { Architecture::X86_64 => { - let machine = MachineX86_64::new(); + let machine = if target.cpu_features().contains(CpuFeature::AVX) { + MachineX86_64::new(Some(CpuFeature::AVX)) + } else if target.cpu_features().contains(CpuFeature::SSE42) { + MachineX86_64::new(Some(CpuFeature::SSE42)) + } else { + unimplemented!() + }; machine.gen_std_trampoline(sig, calling_convention) } Architecture::Aarch64(_) => { @@ -2210,6 +2216,7 @@ pub fn gen_std_trampoline( _ => unimplemented!(), } } + /// Generates dynamic import function call trampoline for a function type. 
pub fn gen_std_dynamic_import_trampoline( vmoffsets: &VMOffsets, @@ -2219,7 +2226,13 @@ pub fn gen_std_dynamic_import_trampoline( ) -> FunctionBody { match target.triple().architecture { Architecture::X86_64 => { - let machine = MachineX86_64::new(); + let machine = if target.cpu_features().contains(CpuFeature::AVX) { + MachineX86_64::new(Some(CpuFeature::AVX)) + } else if target.cpu_features().contains(CpuFeature::SSE42) { + MachineX86_64::new(Some(CpuFeature::SSE42)) + } else { + unimplemented!() + }; machine.gen_std_dynamic_import_trampoline(vmoffsets, sig, calling_convention) } Architecture::Aarch64(_) => { @@ -2239,7 +2252,13 @@ pub fn gen_import_call_trampoline( ) -> CustomSection { match target.triple().architecture { Architecture::X86_64 => { - let machine = MachineX86_64::new(); + let machine = if target.cpu_features().contains(CpuFeature::AVX) { + MachineX86_64::new(Some(CpuFeature::AVX)) + } else if target.cpu_features().contains(CpuFeature::SSE42) { + MachineX86_64::new(Some(CpuFeature::SSE42)) + } else { + unimplemented!() + }; machine.gen_import_call_trampoline(vmoffsets, index, sig, calling_convention) } Architecture::Aarch64(_) => { diff --git a/lib/compiler-singlepass/src/machine_x64.rs b/lib/compiler-singlepass/src/machine_x64.rs index c5595b989b8..a463a010f32 100644 --- a/lib/compiler-singlepass/src/machine_x64.rs +++ b/lib/compiler-singlepass/src/machine_x64.rs @@ -5,20 +5,58 @@ use crate::location::Reg; use crate::machine::*; use crate::x64_decl::new_machine_state; use crate::x64_decl::{ArgumentRegisterAllocator, X64Register, GPR, XMM}; -use dynasmrt::{x64::X64Relocation, VecAssembler}; +use dynasmrt::{x64::X64Relocation, DynasmError, VecAssembler}; +use std::ops::{Deref, DerefMut}; use wasmer_compiler::wasmparser::Type as WpType; use wasmer_compiler::{ - CallingConvention, CustomSection, CustomSectionProtection, FunctionBody, InstructionAddressMap, - Relocation, RelocationKind, RelocationTarget, SectionBody, SourceLoc, TrapInformation, + CallingConvention, CpuFeature, CustomSection, CustomSectionProtection, FunctionBody, + InstructionAddressMap, Relocation, RelocationKind, RelocationTarget, SectionBody, SourceLoc, + TrapInformation, }; use wasmer_types::{FunctionIndex, FunctionType, Type}; use wasmer_vm::{TrapCode, VMOffsets}; type Assembler = VecAssembler; + +pub struct AssemblerX64 { + /// the actual inner + pub inner: Assembler, + /// the simd instructions set on the target. 
+ /// Currently only supports SSE 4.2 and AVX + pub simd_arch: Option, +} + +impl AssemblerX64 { + fn new(baseaddr: usize, simd_arch: Option) -> Self { + Self { + inner: Assembler::new(baseaddr), + simd_arch, + } + } + + fn finalize(self) -> Result, DynasmError> { + self.inner.finalize() + } +} + +impl Deref for AssemblerX64 { + type Target = Assembler; + + fn deref(&self) -> &Self::Target { + &self.inner + } +} + +impl DerefMut for AssemblerX64 { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.inner + } +} + type Location = AbstractLocation; pub struct MachineX86_64 { - assembler: Assembler, + assembler: AssemblerX64, used_gprs: u32, used_simd: u32, trap_table: TrapTable, @@ -31,9 +69,9 @@ pub struct MachineX86_64 { } impl MachineX86_64 { - pub fn new() -> Self { + pub fn new(simd_arch: Option) -> Self { MachineX86_64 { - assembler: Assembler::new(0), + assembler: AssemblerX64::new(0, simd_arch), used_gprs: 0, used_simd: 0, trap_table: TrapTable::default(), @@ -43,7 +81,7 @@ impl MachineX86_64 { } pub fn emit_relaxed_binop( &mut self, - op: fn(&mut Assembler, Size, Location, Location), + op: fn(&mut AssemblerX64, Size, Location, Location), sz: Size, src: Location, dst: Location, @@ -56,11 +94,11 @@ impl MachineX86_64 { } let mode = match (src, dst) { (Location::GPR(_), Location::GPR(_)) - if (op as *const u8 == Assembler::emit_imul as *const u8) => + if (op as *const u8 == AssemblerX64::emit_imul as *const u8) => { RelaxMode::Direct } - _ if (op as *const u8 == Assembler::emit_imul as *const u8) => RelaxMode::BothToGPR, + _ if (op as *const u8 == AssemblerX64::emit_imul as *const u8) => RelaxMode::BothToGPR, (Location::Memory(_, _), Location::Memory(_, _)) => RelaxMode::SrcToGPR, (Location::Imm64(_), Location::Imm64(_)) | (Location::Imm64(_), Location::Imm32(_)) => { @@ -69,7 +107,7 @@ impl MachineX86_64 { (_, Location::Imm32(_)) | (_, Location::Imm64(_)) => RelaxMode::DstToGPR, (Location::Imm64(_), Location::Memory(_, _)) => RelaxMode::SrcToGPR, (Location::Imm64(_), Location::GPR(_)) - if (op as *const u8 != Assembler::emit_mov as *const u8) => + if (op as *const u8 != AssemblerX64::emit_mov as *const u8) => { RelaxMode::SrcToGPR } @@ -117,7 +155,7 @@ impl MachineX86_64 { } pub fn emit_relaxed_zx_sx( &mut self, - op: fn(&mut Assembler, Size, Location, Size, Location), + op: fn(&mut AssemblerX64, Size, Location, Size, Location), sz_src: Size, src: Location, sz_dst: Size, @@ -187,7 +225,7 @@ impl MachineX86_64 { /// I32 binary operation with both operands popped from the virtual stack. fn emit_binop_i32( &mut self, - f: fn(&mut Assembler, Size, Location, Location), + f: fn(&mut AssemblerX64, Size, Location, Location), loc_a: Location, loc_b: Location, ret: Location, @@ -205,7 +243,7 @@ impl MachineX86_64 { /// I64 binary operation with both operands popped from the virtual stack. fn emit_binop_i64( &mut self, - f: fn(&mut Assembler, Size, Location, Location), + f: fn(&mut AssemblerX64, Size, Location, Location), loc_a: Location, loc_b: Location, ret: Location, @@ -252,7 +290,7 @@ impl MachineX86_64 { /// I64 shift with both operands popped from the virtual stack. fn emit_shift_i64( &mut self, - f: fn(&mut Assembler, Size, Location, Location), + f: fn(&mut AssemblerX64, Size, Location, Location), loc_a: Location, loc_b: Location, ret: Location, @@ -269,7 +307,7 @@ impl MachineX86_64 { /// Moves `loc` to a valid location for `div`/`idiv`. 
fn emit_relaxed_xdiv( &mut self, - op: fn(&mut Assembler, Size, Location), + op: fn(&mut AssemblerX64, Size, Location), sz: Size, loc: Location, integer_division_by_zero: Label, @@ -326,7 +364,7 @@ impl MachineX86_64 { /// I32 shift with both operands popped from the virtual stack. fn emit_shift_i32( &mut self, - f: fn(&mut Assembler, Size, Location, Location), + f: fn(&mut AssemblerX64, Size, Location, Location), loc_a: Location, loc_b: Location, ret: Location, @@ -359,7 +397,7 @@ impl MachineX86_64 { let (base_loc, bound_loc) = if imported_memories { // Imported memories require one level of indirection. self.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S64, Location::Memory(self.get_vmctx_reg(), offset), Location::GPR(tmp_addr), @@ -806,7 +844,7 @@ impl MachineX86_64 { /// Moves `src1` and `src2` to valid locations and possibly adds a layer of indirection for `dst` for AVX instructions. fn emit_relaxed_avx( &mut self, - op: fn(&mut Assembler, XMM, XMMOrMemory, XMM), + op: fn(&mut AssemblerX64, XMM, XMMOrMemory, XMM), src1: Location, src2: Location, dst: Location, @@ -1552,7 +1590,7 @@ impl MachineX86_64 { } fn emit_relaxed_atomic_xchg(&mut self, sz: Size, src: Location, dst: Location) { - self.emit_relaxed_binop(Assembler::emit_xchg, sz, src, dst); + self.emit_relaxed_binop(AssemblerX64::emit_xchg, sz, src, dst); } fn used_gprs_contains(&self, r: &GPR) -> bool { @@ -2385,10 +2423,10 @@ impl Machine for MachineX86_64 { // relaxed binop based... fn emit_relaxed_mov(&mut self, sz: Size, src: Location, dst: Location) { - self.emit_relaxed_binop(Assembler::emit_mov, sz, src, dst); + self.emit_relaxed_binop(AssemblerX64::emit_mov, sz, src, dst); } fn emit_relaxed_cmp(&mut self, sz: Size, src: Location, dst: Location) { - self.emit_relaxed_binop(Assembler::emit_cmp, sz, src, dst); + self.emit_relaxed_binop(AssemblerX64::emit_cmp, sz, src, dst); } fn emit_relaxed_zero_extension( &mut self, @@ -2398,9 +2436,9 @@ impl Machine for MachineX86_64 { dst: Location, ) { if (sz_src == Size::S32 || sz_src == Size::S64) && sz_dst == Size::S64 { - self.emit_relaxed_binop(Assembler::emit_mov, sz_src, src, dst); + self.emit_relaxed_binop(AssemblerX64::emit_mov, sz_src, src, dst); } else { - self.emit_relaxed_zx_sx(Assembler::emit_movzx, sz_src, src, sz_dst, dst); + self.emit_relaxed_zx_sx(AssemblerX64::emit_movzx, sz_src, src, sz_dst, dst); } } fn emit_relaxed_sign_extension( @@ -2410,17 +2448,17 @@ impl Machine for MachineX86_64 { sz_dst: Size, dst: Location, ) { - self.emit_relaxed_zx_sx(Assembler::emit_movsx, sz_src, src, sz_dst, dst); + self.emit_relaxed_zx_sx(AssemblerX64::emit_movsx, sz_src, src, sz_dst, dst); } fn emit_binop_add32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i32(Assembler::emit_add, loc_a, loc_b, ret); + self.emit_binop_i32(AssemblerX64::emit_add, loc_a, loc_b, ret); } fn emit_binop_sub32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i32(Assembler::emit_sub, loc_a, loc_b, ret); + self.emit_binop_i32(AssemblerX64::emit_sub, loc_a, loc_b, ret); } fn emit_binop_mul32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i32(Assembler::emit_imul, loc_a, loc_b, ret); + self.emit_binop_i32(AssemblerX64::emit_imul, loc_a, loc_b, ret); } fn emit_binop_udiv32( &mut self, @@ -2436,7 +2474,7 @@ impl Machine for MachineX86_64 { self.assembler .emit_xor(Size::S32, Location::GPR(GPR::RDX), Location::GPR(GPR::RDX)); let offset = self.emit_relaxed_xdiv( - Assembler::emit_div, + 
AssemblerX64::emit_div, Size::S32, loc_b, integer_division_by_zero, @@ -2458,7 +2496,7 @@ impl Machine for MachineX86_64 { .emit_mov(Size::S32, loc_a, Location::GPR(GPR::RAX)); self.assembler.emit_cdq(); let offset = self.emit_relaxed_xdiv( - Assembler::emit_idiv, + AssemblerX64::emit_idiv, Size::S32, loc_b, integer_division_by_zero, @@ -2481,7 +2519,7 @@ impl Machine for MachineX86_64 { self.assembler .emit_xor(Size::S32, Location::GPR(GPR::RDX), Location::GPR(GPR::RDX)); let offset = self.emit_relaxed_xdiv( - Assembler::emit_div, + AssemblerX64::emit_div, Size::S32, loc_b, integer_division_by_zero, @@ -2514,7 +2552,7 @@ impl Machine for MachineX86_64 { .emit_mov(Size::S32, loc_a, Location::GPR(GPR::RAX)); self.assembler.emit_cdq(); let offset = self.emit_relaxed_xdiv( - Assembler::emit_idiv, + AssemblerX64::emit_idiv, Size::S32, loc_b, integer_division_by_zero, @@ -2526,13 +2564,13 @@ impl Machine for MachineX86_64 { offset } fn emit_binop_and32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i32(Assembler::emit_and, loc_a, loc_b, ret); + self.emit_binop_i32(AssemblerX64::emit_and, loc_a, loc_b, ret); } fn emit_binop_or32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i32(Assembler::emit_or, loc_a, loc_b, ret); + self.emit_binop_i32(AssemblerX64::emit_or, loc_a, loc_b, ret); } fn emit_binop_xor32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i32(Assembler::emit_xor, loc_a, loc_b, ret); + self.emit_binop_i32(AssemblerX64::emit_xor, loc_a, loc_b, ret); } fn i32_cmp_ge_s(&mut self, loc_a: Location, loc_b: Location, ret: Location) { self.emit_cmpop_i32_dynamic_b(Condition::GreaterEqual, loc_a, loc_b, ret); @@ -2698,19 +2736,19 @@ impl Machine for MachineX86_64 { } } fn i32_shl(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i32(Assembler::emit_shl, loc_a, loc_b, ret); + self.emit_shift_i32(AssemblerX64::emit_shl, loc_a, loc_b, ret); } fn i32_shr(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i32(Assembler::emit_shr, loc_a, loc_b, ret); + self.emit_shift_i32(AssemblerX64::emit_shr, loc_a, loc_b, ret); } fn i32_sar(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i32(Assembler::emit_sar, loc_a, loc_b, ret); + self.emit_shift_i32(AssemblerX64::emit_sar, loc_a, loc_b, ret); } fn i32_rol(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i32(Assembler::emit_rol, loc_a, loc_b, ret); + self.emit_shift_i32(AssemblerX64::emit_rol, loc_a, loc_b, ret); } fn i32_ror(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i32(Assembler::emit_ror, loc_a, loc_b, ret); + self.emit_shift_i32(AssemblerX64::emit_ror, loc_a, loc_b, ret); } fn i32_load( &mut self, @@ -2733,7 +2771,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S32, Location::Memory(addr, 0), ret, @@ -2762,7 +2800,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_zx_sx( - Assembler::emit_movzx, + AssemblerX64::emit_movzx, Size::S8, Location::Memory(addr, 0), Size::S32, @@ -2792,7 +2830,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_zx_sx( - Assembler::emit_movsx, + AssemblerX64::emit_movsx, Size::S8, Location::Memory(addr, 0), Size::S32, @@ -2822,7 +2860,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { 
this.emit_relaxed_zx_sx( - Assembler::emit_movzx, + AssemblerX64::emit_movzx, Size::S16, Location::Memory(addr, 0), Size::S32, @@ -2852,7 +2890,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_zx_sx( - Assembler::emit_movsx, + AssemblerX64::emit_movsx, Size::S16, Location::Memory(addr, 0), Size::S32, @@ -2964,7 +3002,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S32, target_value, Location::Memory(addr, 0), @@ -2993,7 +3031,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S8, target_value, Location::Memory(addr, 0), @@ -3022,7 +3060,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S16, target_value, Location::Memory(addr, 0), @@ -3855,13 +3893,13 @@ impl Machine for MachineX86_64 { } fn emit_binop_add64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i64(Assembler::emit_add, loc_a, loc_b, ret); + self.emit_binop_i64(AssemblerX64::emit_add, loc_a, loc_b, ret); } fn emit_binop_sub64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i64(Assembler::emit_sub, loc_a, loc_b, ret); + self.emit_binop_i64(AssemblerX64::emit_sub, loc_a, loc_b, ret); } fn emit_binop_mul64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i64(Assembler::emit_imul, loc_a, loc_b, ret); + self.emit_binop_i64(AssemblerX64::emit_imul, loc_a, loc_b, ret); } fn emit_binop_udiv64( &mut self, @@ -3877,7 +3915,7 @@ impl Machine for MachineX86_64 { self.assembler .emit_xor(Size::S64, Location::GPR(GPR::RDX), Location::GPR(GPR::RDX)); let offset = self.emit_relaxed_xdiv( - Assembler::emit_div, + AssemblerX64::emit_div, Size::S64, loc_b, integer_division_by_zero, @@ -3899,7 +3937,7 @@ impl Machine for MachineX86_64 { .emit_mov(Size::S64, loc_a, Location::GPR(GPR::RAX)); self.assembler.emit_cqo(); let offset = self.emit_relaxed_xdiv( - Assembler::emit_idiv, + AssemblerX64::emit_idiv, Size::S64, loc_b, integer_division_by_zero, @@ -3922,7 +3960,7 @@ impl Machine for MachineX86_64 { self.assembler .emit_xor(Size::S64, Location::GPR(GPR::RDX), Location::GPR(GPR::RDX)); let offset = self.emit_relaxed_xdiv( - Assembler::emit_div, + AssemblerX64::emit_div, Size::S64, loc_b, integer_division_by_zero, @@ -3955,7 +3993,7 @@ impl Machine for MachineX86_64 { .emit_mov(Size::S64, loc_a, Location::GPR(GPR::RAX)); self.assembler.emit_cqo(); let offset = self.emit_relaxed_xdiv( - Assembler::emit_idiv, + AssemblerX64::emit_idiv, Size::S64, loc_b, integer_division_by_zero, @@ -3967,13 +4005,13 @@ impl Machine for MachineX86_64 { offset } fn emit_binop_and64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i64(Assembler::emit_and, loc_a, loc_b, ret); + self.emit_binop_i64(AssemblerX64::emit_and, loc_a, loc_b, ret); } fn emit_binop_or64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i64(Assembler::emit_or, loc_a, loc_b, ret); + self.emit_binop_i64(AssemblerX64::emit_or, loc_a, loc_b, ret); } fn emit_binop_xor64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_binop_i64(Assembler::emit_xor, loc_a, loc_b, ret); + self.emit_binop_i64(AssemblerX64::emit_xor, loc_a, loc_b, ret); } fn i64_cmp_ge_s(&mut self, loc_a: Location, loc_b: Location, 
ret: Location) { self.emit_cmpop_i64_dynamic_b(Condition::GreaterEqual, loc_a, loc_b, ret); @@ -4139,19 +4177,19 @@ impl Machine for MachineX86_64 { } } fn i64_shl(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i64(Assembler::emit_shl, loc_a, loc_b, ret); + self.emit_shift_i64(AssemblerX64::emit_shl, loc_a, loc_b, ret); } fn i64_shr(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i64(Assembler::emit_shr, loc_a, loc_b, ret); + self.emit_shift_i64(AssemblerX64::emit_shr, loc_a, loc_b, ret); } fn i64_sar(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i64(Assembler::emit_sar, loc_a, loc_b, ret); + self.emit_shift_i64(AssemblerX64::emit_sar, loc_a, loc_b, ret); } fn i64_rol(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i64(Assembler::emit_rol, loc_a, loc_b, ret); + self.emit_shift_i64(AssemblerX64::emit_rol, loc_a, loc_b, ret); } fn i64_ror(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_shift_i64(Assembler::emit_ror, loc_a, loc_b, ret); + self.emit_shift_i64(AssemblerX64::emit_ror, loc_a, loc_b, ret); } fn i64_load( &mut self, @@ -4174,7 +4212,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S64, Location::Memory(addr, 0), ret, @@ -4203,7 +4241,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_zx_sx( - Assembler::emit_movzx, + AssemblerX64::emit_movzx, Size::S8, Location::Memory(addr, 0), Size::S64, @@ -4233,7 +4271,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_zx_sx( - Assembler::emit_movsx, + AssemblerX64::emit_movsx, Size::S8, Location::Memory(addr, 0), Size::S64, @@ -4263,7 +4301,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_zx_sx( - Assembler::emit_movzx, + AssemblerX64::emit_movzx, Size::S16, Location::Memory(addr, 0), Size::S64, @@ -4293,7 +4331,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_zx_sx( - Assembler::emit_movsx, + AssemblerX64::emit_movsx, Size::S16, Location::Memory(addr, 0), Size::S64, @@ -4336,7 +4374,7 @@ impl Machine for MachineX86_64 { } } this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S32, Location::Memory(addr, 0), ret, @@ -4365,7 +4403,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_zx_sx( - Assembler::emit_movsx, + AssemblerX64::emit_movsx, Size::S32, Location::Memory(addr, 0), Size::S64, @@ -4519,7 +4557,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S64, target_value, Location::Memory(addr, 0), @@ -4548,7 +4586,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S8, target_value, Location::Memory(addr, 0), @@ -4577,7 +4615,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S16, target_value, Location::Memory(addr, 0), @@ -4606,7 +4644,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S32, target_value, Location::Memory(addr, 0), @@ -5696,7 +5734,7 @@ impl Machine for MachineX86_64 { 
heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S32, Location::Memory(addr, 0), ret, @@ -5728,7 +5766,7 @@ impl Machine for MachineX86_64 { |this, addr| { if !canonicalize { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S32, target_value, Location::Memory(addr, 0), @@ -5760,7 +5798,7 @@ impl Machine for MachineX86_64 { heap_access_oob, |this, addr| { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S64, Location::Memory(addr, 0), ret, @@ -5792,7 +5830,7 @@ impl Machine for MachineX86_64 { |this, addr| { if !canonicalize { this.emit_relaxed_binop( - Assembler::emit_mov, + AssemblerX64::emit_mov, Size::S64, target_value, Location::Memory(addr, 0), @@ -6014,10 +6052,10 @@ impl Machine for MachineX86_64 { } } fn convert_f64_f32(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcvtss2sd, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcvtss2sd, loc, loc, ret); } fn convert_f32_f64(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcvtsd2ss, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcvtsd2ss, loc, loc, ret); } fn f64_neg(&mut self, loc: Location, ret: Location) { if self.assembler.arch_has_fneg() { @@ -6076,47 +6114,47 @@ impl Machine for MachineX86_64 { self.release_gpr(c); } fn f64_sqrt(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vsqrtsd, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vsqrtsd, loc, loc, ret); } fn f64_trunc(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vroundsd_trunc, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vroundsd_trunc, loc, loc, ret); } fn f64_ceil(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vroundsd_ceil, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vroundsd_ceil, loc, loc, ret); } fn f64_floor(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vroundsd_floor, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vroundsd_floor, loc, loc, ret); } fn f64_nearest(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vroundsd_nearest, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vroundsd_nearest, loc, loc, ret); } fn f64_cmp_ge(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpgesd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpgesd, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f64_cmp_gt(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpgtsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpgtsd, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f64_cmp_le(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmplesd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmplesd, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f64_cmp_lt(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpltsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpltsd, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); 
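// Illustrative sketch (not part of the diff): every f64/f32 comparison lowering in
// these hunks follows the same two-step shape, a relaxed vcmp*sd / vcmp*ss issued
// through an `AssemblerX64` function pointer, then `and ret, 1` to reduce the
// all-ones/all-zeros compare mask to the 0/1 i32 value Wasm expects. The helper
// name `emit_f64_cmp` and its free-function form are hypothetical; the real
// methods inline this pattern per operator, and the types are the crate's
// internal `MachineX86_64`, `Location`, `XMM`, `XMMOrMemory` and `Size`.
fn emit_f64_cmp(
    machine: &mut MachineX86_64,
    op: fn(&mut AssemblerX64, XMM, XMMOrMemory, XMM),
    loc_a: Location,
    loc_b: Location,
    ret: Location,
) {
    // The relaxed-AVX helper takes care of operand placement and of adding an
    // indirection for `ret` when it is not an XMM register.
    machine.emit_relaxed_avx(op, loc_a, loc_b, ret);
    // Keep only bit 0 of the comparison mask.
    machine.assembler.emit_and(Size::S32, Location::Imm32(1), ret);
}
// e.g. emit_f64_cmp(machine, AssemblerX64::emit_vcmpltsd, loc_a, loc_b, ret);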
} fn f64_cmp_ne(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpneqsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpneqsd, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f64_cmp_eq(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpeqsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpeqsd, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f64_min(&mut self, loc_a: Location, loc_b: Location, ret: Location) { if !self.arch_supports_canonicalize_nan() { - self.emit_relaxed_avx(Assembler::emit_vminsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vminsd, loc_a, loc_b, ret); } else { let tmp1 = self.acquire_temp_simd().unwrap(); let tmp2 = self.acquire_temp_simd().unwrap(); @@ -6225,7 +6263,7 @@ impl Machine for MachineX86_64 { } fn f64_max(&mut self, loc_a: Location, loc_b: Location, ret: Location) { if !self.arch_supports_canonicalize_nan() { - self.emit_relaxed_avx(Assembler::emit_vmaxsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vmaxsd, loc_a, loc_b, ret); } else { let tmp1 = self.acquire_temp_simd().unwrap(); let tmp2 = self.acquire_temp_simd().unwrap(); @@ -6328,16 +6366,16 @@ impl Machine for MachineX86_64 { } } fn f64_add(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vaddsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vaddsd, loc_a, loc_b, ret); } fn f64_sub(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vsubsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vsubsd, loc_a, loc_b, ret); } fn f64_mul(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vmulsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vmulsd, loc_a, loc_b, ret); } fn f64_div(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vdivsd, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vdivsd, loc_a, loc_b, ret); } fn f32_neg(&mut self, loc: Location, ret: Location) { if self.assembler.arch_has_fneg() { @@ -6380,47 +6418,47 @@ impl Machine for MachineX86_64 { .emit_or(Size::S32, Location::GPR(tmp2), Location::GPR(tmp1)); } fn f32_sqrt(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vsqrtss, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vsqrtss, loc, loc, ret); } fn f32_trunc(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vroundss_trunc, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vroundss_trunc, loc, loc, ret); } fn f32_ceil(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vroundss_ceil, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vroundss_ceil, loc, loc, ret); } fn f32_floor(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vroundss_floor, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vroundss_floor, loc, loc, ret); } fn f32_nearest(&mut self, loc: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vroundss_nearest, loc, loc, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vroundss_nearest, loc, loc, ret); } fn f32_cmp_ge(&mut self, loc_a: Location, loc_b: Location, ret: Location) 
{ - self.emit_relaxed_avx(Assembler::emit_vcmpgess, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpgess, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f32_cmp_gt(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpgtss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpgtss, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f32_cmp_le(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpless, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpless, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f32_cmp_lt(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpltss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpltss, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f32_cmp_ne(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpneqss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpneqss, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f32_cmp_eq(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vcmpeqss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vcmpeqss, loc_a, loc_b, ret); self.assembler.emit_and(Size::S32, Location::Imm32(1), ret); } fn f32_min(&mut self, loc_a: Location, loc_b: Location, ret: Location) { if !self.arch_supports_canonicalize_nan() { - self.emit_relaxed_avx(Assembler::emit_vminss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vminss, loc_a, loc_b, ret); } else { let tmp1 = self.acquire_temp_simd().unwrap(); let tmp2 = self.acquire_temp_simd().unwrap(); @@ -6529,7 +6567,7 @@ impl Machine for MachineX86_64 { } fn f32_max(&mut self, loc_a: Location, loc_b: Location, ret: Location) { if !self.arch_supports_canonicalize_nan() { - self.emit_relaxed_avx(Assembler::emit_vmaxss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vmaxss, loc_a, loc_b, ret); } else { let tmp1 = self.acquire_temp_simd().unwrap(); let tmp2 = self.acquire_temp_simd().unwrap(); @@ -6632,16 +6670,16 @@ impl Machine for MachineX86_64 { } } fn f32_add(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vaddss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vaddss, loc_a, loc_b, ret); } fn f32_sub(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vsubss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vsubss, loc_a, loc_b, ret); } fn f32_mul(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vmulss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vmulss, loc_a, loc_b, ret); } fn f32_div(&mut self, loc_a: Location, loc_b: Location, ret: Location) { - self.emit_relaxed_avx(Assembler::emit_vdivss, loc_a, loc_b, ret); + self.emit_relaxed_avx(AssemblerX64::emit_vdivss, loc_a, loc_b, ret); } fn gen_std_trampoline( @@ -6649,7 +6687,8 @@ impl Machine for MachineX86_64 { sig: &FunctionType, calling_convention: CallingConvention, ) -> FunctionBody { - let mut a = Assembler::new(0); + // the cpu feature here is irrelevant + let mut a = 
AssemblerX64::new(0, None);
 
         // Calculate stack offset.
         let mut stack_offset: u32 = 0;
@@ -6761,7 +6800,8 @@ impl Machine for MachineX86_64 {
         sig: &FunctionType,
         calling_convention: CallingConvention,
     ) -> FunctionBody {
-        let mut a = Assembler::new(0);
+        // the cpu feature here is irrelevant
+        let mut a = AssemblerX64::new(0, None);
 
         // Allocate argument array.
         let stack_offset: usize = 16 * std::cmp::max(sig.params().len(), sig.results().len()) + 8; // 16 bytes each + 8 bytes sysv call padding
@@ -6883,7 +6923,8 @@ impl Machine for MachineX86_64 {
         sig: &FunctionType,
         calling_convention: CallingConvention,
     ) -> CustomSection {
-        let mut a = Assembler::new(0);
+        // the cpu feature here is irrelevant
+        let mut a = AssemblerX64::new(0, None);
 
         // TODO: ARM entry trampoline is not emitted.
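// Illustrative sketch (not part of the diff): the trampoline builders above pass
// `None` as the SIMD arch because a trampoline only shuffles integer and stack
// arguments and never emits float/SIMD instructions, so no AVX-vs-SSE dispatch
// is needed. The constructor shape assumed here is the one used above,
// `AssemblerX64::new(base_offset, Option<CpuFeature>)`; the function names are
// hypothetical.
use wasmer_compiler::CpuFeature;

fn trampoline_assembler() -> AssemblerX64 {
    // No SIMD instructions are emitted here, so the feature can be omitted.
    AssemblerX64::new(0, None)
}

fn body_assembler(simd_arch: CpuFeature) -> AssemblerX64 {
    // Function bodies do need the selected feature (e.g. AVX or SSE 4.2) so the
    // emitter knows which encoding of each SIMD instruction to produce.
    AssemblerX64::new(0, Some(simd_arch))
}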
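// Illustrative sketch (not part of the diff): the i32 unsigned-division hunks
// earlier in this file show the pattern now used throughout, the concrete
// `AssemblerX64::emit_*` method is passed as a plain `fn` pointer into a
// "relaxed" helper that fixes up operand locations and, for division, records
// the divide-by-zero trap site. The free-function form and the name
// `lower_i32_div_u` are hypothetical; the real code lives in `emit_binop_udiv32`,
// whose full body is not shown in the hunks above.
fn lower_i32_div_u(
    machine: &mut MachineX86_64,
    loc_a: Location,
    loc_b: Location,
    ret: Location,
    integer_division_by_zero: Label,
) -> usize {
    // Dividend goes in EAX; unsigned `div` divides EDX:EAX, so EDX must be zero.
    machine.assembler.emit_mov(Size::S32, loc_a, Location::GPR(GPR::RAX));
    machine
        .assembler
        .emit_xor(Size::S32, Location::GPR(GPR::RDX), Location::GPR(GPR::RDX));
    // `emit_relaxed_xdiv` returns the code offset of the trapping instruction.
    let offset = machine.emit_relaxed_xdiv(
        AssemblerX64::emit_div,
        Size::S32,
        loc_b,
        integer_division_by_zero,
    );
    // The quotient is left in EAX; move it to the requested return location.
    machine.emit_relaxed_mov(Size::S32, Location::GPR(GPR::RAX), ret);
    offset
}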