diff --git a/CHANGELOG.md b/CHANGELOG.md index 589142e77d2..71d9a0c2511 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Looking for changes that affect our C API? See the [C API Changelog](lib/c-api/CHANGELOG.md). ## **[Unreleased]** +- [#2750](https://github.com/wasmerio/wasmer/pull/2750) Added Aarch64 support to Singlepass (both Linux and macOS). ## 2.1.1 - 2021/12/20 diff --git a/Makefile b/Makefile index 4d49156fba4..2d2fa4c5f09 100644 --- a/Makefile +++ b/Makefile @@ -179,6 +179,11 @@ ifneq ($(ENABLE_SINGLEPASS), 0) ifeq ($(IS_AMD64), 1) compilers += singlepass endif + ifeq ($(IS_AARCH64), 1) + ifneq ($(IS_WINDOWS), 1) + compilers += singlepass + endif + endif endif endif @@ -249,6 +254,9 @@ ifeq ($(ENABLE_SINGLEPASS), 1) ifeq ($(IS_AMD64), 1) compilers_engines += singlepass-universal endif + ifeq ($(IS_AARCH64), 1) + compilers_engines += singlepass-universal + endif endif endif diff --git a/lib/compiler-singlepass/Cargo.toml b/lib/compiler-singlepass/Cargo.toml index 18c8a3bdd29..029c3adde2e 100644 --- a/lib/compiler-singlepass/Cargo.toml +++ b/lib/compiler-singlepass/Cargo.toml @@ -18,8 +18,8 @@ wasmer-types = { path = "../types", version = "=2.1.1", default-features = false rayon = { version = "1.5", optional = true } hashbrown = { version = "0.11", optional = true } more-asserts = "0.2" -dynasm = "1.2" -dynasmrt = "1.2" +dynasm = "1.2.1" +dynasmrt = "1.2.1" lazy_static = "1.4" byteorder = "1.3" smallvec = "1.6" diff --git a/lib/compiler-singlepass/src/arm64_decl.rs b/lib/compiler-singlepass/src/arm64_decl.rs new file mode 100644 index 00000000000..400f705daaf --- /dev/null +++ b/lib/compiler-singlepass/src/arm64_decl.rs @@ -0,0 +1,302 @@ +//! ARM64 structures. + +use crate::common_decl::{MachineState, MachineValue, RegisterIndex}; +use crate::location::CombinedRegister; +use crate::location::Reg as AbstractReg; +use std::collections::BTreeMap; +use wasmer_compiler::CallingConvention; +use wasmer_types::Type; + +/// General-purpose registers. +#[repr(u8)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] +pub enum GPR { + X0 = 0, + X1 = 1, + X2 = 2, + X3 = 3, + X4 = 4, + X5 = 5, + X6 = 6, + X7 = 7, + X8 = 8, + X9 = 9, + X10 = 10, + X11 = 11, + X12 = 12, + X13 = 13, + X14 = 14, + X15 = 15, + X16 = 16, + X17 = 17, + X18 = 18, + X19 = 19, + X20 = 20, + X21 = 21, + X22 = 22, + X23 = 23, + X24 = 24, + X25 = 25, + X26 = 26, + X27 = 27, + X28 = 28, + X29 = 29, + X30 = 30, + XzrSp = 31, +} + +/// NEON registers. 
+#[repr(u8)] +#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] +#[allow(dead_code)] +pub enum NEON { + V0 = 0, + V1 = 1, + V2 = 2, + V3 = 3, + V4 = 4, + V5 = 5, + V6 = 6, + V7 = 7, + V8 = 8, + V9 = 9, + V10 = 10, + V11 = 11, + V12 = 12, + V13 = 13, + V14 = 14, + V15 = 15, + V16 = 16, + V17 = 17, + V18 = 18, + V19 = 19, + V20 = 20, + V21 = 21, + V22 = 22, + V23 = 23, + V24 = 24, + V25 = 25, + V26 = 26, + V27 = 27, + V28 = 28, + V29 = 29, + V30 = 30, + V31 = 31, +} + +impl AbstractReg for GPR { + fn is_callee_save(self) -> bool { + self as usize > 18 + } + fn is_reserved(self) -> bool { + match self.into_index() { + 0..=16 | 19..=27 => false, + _ => true, + } + } + fn into_index(self) -> usize { + self as usize + } + fn from_index(n: usize) -> Result { + const REGS: [GPR; 32] = [ + GPR::X0, + GPR::X1, + GPR::X2, + GPR::X3, + GPR::X4, + GPR::X5, + GPR::X6, + GPR::X7, + GPR::X8, + GPR::X9, + GPR::X10, + GPR::X11, + GPR::X12, + GPR::X13, + GPR::X14, + GPR::X15, + GPR::X16, + GPR::X17, + GPR::X18, + GPR::X19, + GPR::X20, + GPR::X21, + GPR::X22, + GPR::X23, + GPR::X24, + GPR::X25, + GPR::X26, + GPR::X27, + GPR::X28, + GPR::X29, + GPR::X30, + GPR::XzrSp, + ]; + REGS.get(n).cloned().ok_or(()) + } +} + +impl AbstractReg for NEON { + fn is_callee_save(self) -> bool { + self as usize > 16 + } + fn is_reserved(self) -> bool { + false + } + fn into_index(self) -> usize { + self as usize + } + fn from_index(n: usize) -> Result { + const REGS: [NEON; 32] = [ + NEON::V0, + NEON::V1, + NEON::V2, + NEON::V3, + NEON::V4, + NEON::V5, + NEON::V6, + NEON::V7, + NEON::V8, + NEON::V9, + NEON::V10, + NEON::V11, + NEON::V12, + NEON::V13, + NEON::V14, + NEON::V15, + NEON::V16, + NEON::V17, + NEON::V18, + NEON::V19, + NEON::V20, + NEON::V21, + NEON::V22, + NEON::V23, + NEON::V24, + NEON::V25, + NEON::V26, + NEON::V27, + NEON::V28, + NEON::V29, + NEON::V30, + NEON::V31, + ]; + match n { + 0..=31 => Ok(REGS[n]), + _ => Err(()), + } + } +} + +/// A machine register under the x86-64 architecture. +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum ARM64Register { + /// General-purpose registers. + GPR(GPR), + /// NEON (floating point/SIMD) registers. + NEON(NEON), +} + +impl CombinedRegister for ARM64Register { + /// Returns the index of the register. + fn to_index(&self) -> RegisterIndex { + match *self { + ARM64Register::GPR(x) => RegisterIndex(x as usize), + ARM64Register::NEON(x) => RegisterIndex(x as usize + 64), + } + } + /// Convert from a GPR register + fn from_gpr(x: u16) -> Self { + ARM64Register::GPR(GPR::from_index(x as usize).unwrap()) + } + /// Convert from an SIMD register + fn from_simd(x: u16) -> Self { + ARM64Register::NEON(NEON::from_index(x as usize).unwrap()) + } + + /// Converts a DWARF regnum to ARM64Register. + fn _from_dwarf_regnum(x: u16) -> Option { + Some(match x { + 0..=31 => ARM64Register::GPR(GPR::from_index(x as usize).unwrap()), + 64..=95 => ARM64Register::NEON(NEON::from_index(x as usize - 64).unwrap()), + _ => return None, + }) + } +} + +/// An allocator that allocates registers for function arguments according to the System V ABI. +#[derive(Default)] +pub struct ArgumentRegisterAllocator { + n_gprs: usize, + n_neons: usize, +} + +impl ArgumentRegisterAllocator { + /// Allocates a register for argument type `ty`. Returns `None` if no register is available for this type. 
+ pub fn next( + &mut self, + ty: Type, + calling_convention: CallingConvention, + ) -> Option { + match calling_convention { + CallingConvention::SystemV | CallingConvention::AppleAarch64 => { + static GPR_SEQ: &'static [GPR] = &[ + GPR::X0, + GPR::X1, + GPR::X2, + GPR::X3, + GPR::X4, + GPR::X5, + GPR::X6, + GPR::X7, + ]; + static NEON_SEQ: &'static [NEON] = &[ + NEON::V0, + NEON::V1, + NEON::V2, + NEON::V3, + NEON::V4, + NEON::V5, + NEON::V6, + NEON::V7, + ]; + match ty { + Type::I32 | Type::I64 => { + if self.n_gprs < GPR_SEQ.len() { + let gpr = GPR_SEQ[self.n_gprs]; + self.n_gprs += 1; + Some(ARM64Register::GPR(gpr)) + } else { + None + } + } + Type::F32 | Type::F64 => { + if self.n_neons < NEON_SEQ.len() { + let neon = NEON_SEQ[self.n_neons]; + self.n_neons += 1; + Some(ARM64Register::NEON(neon)) + } else { + None + } + } + _ => todo!( + "ArgumentRegisterAllocator::next: Unsupported type: {:?}", + ty + ), + } + } + _ => unimplemented!(), + } + } +} + +/// Create a new `MachineState` with default values. +pub fn new_machine_state() -> MachineState { + MachineState { + stack_values: vec![], + register_values: vec![MachineValue::Undefined; 32 + 32], + prev_frame: BTreeMap::new(), + wasm_stack: vec![], + wasm_inst_offset: std::usize::MAX, + } +} diff --git a/lib/compiler-singlepass/src/codegen.rs b/lib/compiler-singlepass/src/codegen.rs index 6bdbc4375b0..c9f98b30b64 100644 --- a/lib/compiler-singlepass/src/codegen.rs +++ b/lib/compiler-singlepass/src/codegen.rs @@ -85,6 +85,7 @@ pub struct FuncGen<'a, M: Machine> { struct SpecialLabelSet { integer_division_by_zero: Label, + integer_overflow: Label, heap_access_oob: Label, table_access_oob: Label, indirect_call_null: Label, @@ -290,6 +291,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { ret.push(loc); } + let delta_stack_offset = self.machine.round_stack_adjust(delta_stack_offset); if delta_stack_offset != 0 { self.machine.adjust_stack(delta_stack_offset as u32); } @@ -335,7 +337,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { } self.state.wasm_stack.pop().unwrap(); } - + let delta_stack_offset = self.machine.round_stack_adjust(delta_stack_offset); if delta_stack_offset != 0 { self.machine.restore_stack(delta_stack_offset as u32); } @@ -376,6 +378,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.state.wasm_stack.pop().unwrap(); } + let delta_stack_offset = self.machine.round_stack_adjust(delta_stack_offset); if delta_stack_offset != 0 { self.machine.adjust_stack(delta_stack_offset as u32); } @@ -421,6 +424,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { // Wasm state popping is deferred to `release_locations_only_osr_state`. } + let delta_stack_offset = self.machine.round_stack_adjust(delta_stack_offset); if delta_stack_offset != 0 { self.machine.pop_stack_locals(delta_stack_offset as u32); } @@ -457,6 +461,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { } } + let delta_stack_offset = self.machine.round_stack_adjust(delta_stack_offset); if delta_stack_offset != 0 { self.machine.pop_stack_locals(delta_stack_offset as u32); } @@ -465,7 +470,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { fn init_locals( &mut self, n: usize, - n_params: usize, + sig: FunctionType, calling_convention: CallingConvention, ) -> Vec> { // How many machine stack slots will all the locals use? @@ -486,7 +491,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { } } - // Callee-saved R15 for vmctx. + // Callee-saved vmctx. 
static_area_size += 8; // Some ABI (like Windows) needs extrat reg save @@ -504,6 +509,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { static_area_size += num_mem_slots * 8; // Allocate save area, without actually writing to it. + static_area_size = self.machine.round_stack_adjust(static_area_size); self.machine.adjust_stack(static_area_size as _); // Save callee-saved registers. @@ -517,7 +523,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { } } - // Save R15 for vmctx use. + // Save the Reg use for vmctx. self.stack_offset.0 += 8; self.machine.move_local( self.stack_offset.0 as i32, @@ -554,15 +560,29 @@ impl<'a, M: Machine> FuncGen<'a, M> { // Load in-register parameters into the allocated locations. // Locals are allocated on the stack from higher address to lower address, // so we won't skip the stack guard page here. - for i in 0..n_params { - let loc = self.machine.get_param_location(i + 1, calling_convention); - self.machine.move_location(Size::S64, loc, locations[i]); + let mut stack_offset: usize = 0; + for (i, param) in sig.params().iter().enumerate() { + let sz = match *param { + Type::I32 | Type::F32 => Size::S32, + Type::I64 | Type::F64 => Size::S64, + Type::ExternRef | Type::FuncRef => Size::S64, + _ => unimplemented!(), + }; + let loc = self.machine.get_call_param_location( + i + 1, + sz, + &mut stack_offset, + calling_convention, + ); + self.machine + .move_location_extend(sz, false, loc, Size::S64, locations[i]); } - // Load vmctx into R15. + // Load vmctx into it's GPR. self.machine.move_location( Size::S64, - self.machine.get_param_location(0, calling_convention), + self.machine + .get_simple_param_location(0, calling_convention), Location::GPR(self.machine.get_vmctx_reg()), ); @@ -570,14 +590,17 @@ impl<'a, M: Machine> FuncGen<'a, M> { // // `rep stosq` writes data from low address to high address and may skip the stack guard page. // so here we probe it explicitly when needed. - for i in (n_params..n).step_by(NATIVE_PAGE_SIZE / 8).skip(1) { + for i in (sig.params().len()..n) + .step_by(NATIVE_PAGE_SIZE / 8) + .skip(1) + { self.machine.zero_location(Size::S64, locations[i]); } // Initialize all normal locals to zero. let mut init_stack_loc_cnt = 0; let mut last_stack_loc = Location::Memory(self.machine.local_pointer(), i32::MAX); - for i in n_params..n { + for i in sig.params().len()..n { match locations[i] { Location::Memory(_, _) => { init_stack_loc_cnt += 1; @@ -687,24 +710,34 @@ impl<'a, M: Machine> FuncGen<'a, M> { ); } - /// Emits a System V / Windows call sequence. - /// - /// This function will not use RAX before `cb` is called. + /// Emits a Native ABI call sequence. /// /// The caller MUST NOT hold any temporary registers allocated by `acquire_temp_gpr` when calling /// this function. - fn emit_call_native>, F: FnOnce(&mut Self)>( + fn emit_call_native< + I: Iterator>, + J: Iterator, + F: FnOnce(&mut Self), + >( &mut self, cb: F, params: I, + params_type: J, ) -> Result<(), CodegenError> { // Values pushed in this function are above the shadow region. self.state.stack_values.push(MachineValue::ExplicitShadow); let params: Vec<_> = params.collect(); + let params_size: Vec<_> = params_type + .map(|x| match x { + WpType::F32 | WpType::I32 => Size::S32, + WpType::V128 => unimplemented!(), + _ => Size::S64, + }) + .collect(); - // Save used GPRs. - self.machine.push_used_gpr(); + // Save used GPRs. 
Preserve correct stack alignment + let mut used_stack = self.machine.push_used_gpr(); let used_gprs = self.machine.get_used_gprs(); for r in used_gprs.iter() { let content = self.state.register_values[self.machine.index_from_gpr(*r).0].clone(); @@ -716,10 +749,10 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.state.stack_values.push(content); } - // Save used XMM registers. + // Save used SIMD registers. let used_simds = self.machine.get_used_simd(); if used_simds.len() > 0 { - self.machine.push_used_simd(); + used_stack += self.machine.push_used_simd(); for r in used_simds.iter().rev() { let content = @@ -732,6 +765,10 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.state.stack_values.push(content); } } + // mark the GPR used for Call as used + self.machine + .reserve_unused_temp_gpr(self.machine.get_grp_for_call()); + let calling_convention = self.calling_convention; let stack_padding: usize = match calling_convention { @@ -740,35 +777,37 @@ impl<'a, M: Machine> FuncGen<'a, M> { }; let mut stack_offset: usize = 0; - + let mut args: Vec> = vec![]; + let mut pushed_args: usize = 0; // Calculate stack offset. for (i, _param) in params.iter().enumerate() { - if let Location::Memory(_, _) = - self.machine.get_param_location(1 + i, calling_convention) - { - stack_offset += 8; - } + args.push(self.machine.get_param_location( + 1 + i, + params_size[i], + &mut stack_offset, + calling_convention, + )); } // Align stack to 16 bytes. - if (self.get_stack_offset() + used_gprs.len() * 8 + used_simds.len() * 8 + stack_offset) - % 16 - != 0 - { - self.machine.adjust_stack(8); - stack_offset += 8; - self.state.stack_values.push(MachineValue::Undefined); + let stack_unaligned = + (self.machine.round_stack_adjust(self.get_stack_offset()) + used_stack + stack_offset) + % 16; + if stack_unaligned != 0 { + stack_offset += 16 - stack_unaligned; } + self.machine.adjust_stack(stack_offset as u32); let mut call_movs: Vec<(Location, M::GPR)> = vec![]; // Prepare register & stack parameters. for (i, param) in params.iter().enumerate().rev() { - let loc = self.machine.get_param_location(1 + i, calling_convention); + let loc = args[i]; match loc { Location::GPR(x) => { call_movs.push((*param, x)); } Location::Memory(_, _) => { + pushed_args += 1; match *param { Location::GPR(x) => { let content = self.state.register_values @@ -803,7 +842,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.state.stack_values.push(MachineValue::Undefined); } } - self.machine.push_location_for_native(*param); + self.machine.move_location(params_size[i], *param, loc); } _ => { return Err(CodegenError { @@ -828,19 +867,15 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.machine.move_location( Size::S64, Location::GPR(self.machine.get_vmctx_reg()), - self.machine.get_param_location(0, calling_convention), + self.machine + .get_simple_param_location(0, calling_convention), ); // vmctx - if (self.state.stack_values.len() % 2) != 1 { - return Err(CodegenError { - message: "emit_call_native: explicit shadow takes one slot".to_string(), - }); - } - if stack_padding > 0 { self.machine.adjust_stack(stack_padding as u32); } - + // release the GPR used for call + self.machine.release_gpr(self.machine.get_grp_for_call()); cb(self); // Offset needs to be after the 'call' instruction. @@ -863,19 +898,21 @@ impl<'a, M: Machine> FuncGen<'a, M> { // Restore stack. 
if stack_offset + stack_padding > 0 { - self.machine - .restore_stack((stack_offset + stack_padding) as u32); + self.machine.restore_stack( + self.machine + .round_stack_adjust(stack_offset + stack_padding) as u32, + ); if (stack_offset % 8) != 0 { return Err(CodegenError { message: "emit_call_native: Bad restoring stack alignement".to_string(), }); } - for _ in 0..stack_offset / 8 { + for _ in 0..pushed_args { self.state.stack_values.pop().unwrap(); } } - // Restore XMMs. + // Restore SIMDs. if !used_simds.is_empty() { self.machine.pop_used_simd(); for _ in 0..used_simds.len() { @@ -897,13 +934,21 @@ impl<'a, M: Machine> FuncGen<'a, M> { Ok(()) } - /// Emits a System V call sequence, specialized for labels as the call target. - fn _emit_call_native_label>>( + /// Emits a Native ABI call sequence, specialized for labels as the call target. + fn _emit_call_native_label< + I: Iterator>, + J: Iterator, + >( &mut self, label: Label, params: I, + params_type: J, ) -> Result<(), CodegenError> { - self.emit_call_native(|this| this.machine.emit_call_label(label), params)?; + self.emit_call_native( + |this| this.machine.emit_call_label(label), + params, + params_type, + )?; Ok(()) } @@ -945,15 +990,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { } fn emit_head(&mut self) -> Result<(), CodegenError> { - // TODO: Patchpoint is not emitted for now, and ARM trampoline is not prepended. - - // Normal x86 entry prologue. self.machine.emit_function_prolog(); // Initialize locals. self.locals = self.init_locals( self.local_types.len(), - self.signature.params().len(), + self.signature.clone(), self.calling_convention, ); @@ -1024,6 +1066,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { let mut machine = machine; let special_labels = SpecialLabelSet { integer_division_by_zero: machine.get_label(), + integer_overflow: machine.get_label(), heap_access_oob: machine.get_label(), table_access_oob: machine.get_label(), indirect_call_null: machine.get_label(), @@ -1295,6 +1338,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { loc_b, ret, self.special_labels.integer_division_by_zero, + self.special_labels.integer_overflow, ); self.mark_offset_trappable(offset); } @@ -1305,6 +1349,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { loc_b, ret, self.special_labels.integer_division_by_zero, + self.special_labels.integer_overflow, ); self.mark_offset_trappable(offset); } @@ -1315,6 +1360,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { loc_b, ret, self.special_labels.integer_division_by_zero, + self.special_labels.integer_overflow, ); self.mark_offset_trappable(offset); } @@ -1325,6 +1371,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { loc_b, ret, self.special_labels.integer_division_by_zero, + self.special_labels.integer_overflow, ); self.mark_offset_trappable(offset); } @@ -1454,46 +1501,46 @@ impl<'a, M: Machine> FuncGen<'a, M> { self.machine.emit_binop_mul64(loc_a, loc_b, ret); } Operator::I64DivU => { - // We assume that RAX and RDX are temporary registers here. let I2O1 { loc_a, loc_b, ret } = self.i2o1_prepare(WpType::I64); let offset = self.machine.emit_binop_udiv64( loc_a, loc_b, ret, self.special_labels.integer_division_by_zero, + self.special_labels.integer_overflow, ); self.mark_offset_trappable(offset); } Operator::I64DivS => { - // We assume that RAX and RDX are temporary registers here. 
let I2O1 { loc_a, loc_b, ret } = self.i2o1_prepare(WpType::I64); let offset = self.machine.emit_binop_sdiv64( loc_a, loc_b, ret, self.special_labels.integer_division_by_zero, + self.special_labels.integer_overflow, ); self.mark_offset_trappable(offset); } Operator::I64RemU => { - // We assume that RAX and RDX are temporary registers here. let I2O1 { loc_a, loc_b, ret } = self.i2o1_prepare(WpType::I64); let offset = self.machine.emit_binop_urem64( loc_a, loc_b, ret, self.special_labels.integer_division_by_zero, + self.special_labels.integer_overflow, ); self.mark_offset_trappable(offset); } Operator::I64RemS => { - // We assume that RAX and RDX are temporary registers here. let I2O1 { loc_a, loc_b, ret } = self.i2o1_prepare(WpType::I64); let offset = self.machine.emit_binop_srem64( loc_a, loc_b, ret, self.special_labels.integer_division_by_zero, + self.special_labels.integer_overflow, ); self.mark_offset_trappable(offset); } @@ -2563,6 +2610,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { this.machine.mark_instruction_address_end(offset); }, params.iter().copied(), + param_types.iter().copied(), )?; self.release_locations_only_stack(¶ms); @@ -2774,7 +2822,8 @@ impl<'a, M: Machine> FuncGen<'a, M> { gpr_for_call, vmcaller_checked_anyfunc_vmctx as i32, ), - this.machine.get_param_location(0, calling_convention), + this.machine + .get_simple_param_location(0, calling_convention), ); this.machine.emit_call_location(Location::Memory( @@ -2785,6 +2834,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { } }, params.iter().copied(), + param_types.iter().copied(), )?; self.release_locations_only_stack(¶ms); @@ -3018,6 +3068,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { }, // [vmctx, memory_index] iter::once(Location::Imm32(memory_index.index() as u32)), + iter::once(WpType::I64), )?; let ret = self.acquire_locations( &[(WpType::I64, MachineValue::WasmStack(self.value_stack.len()))], @@ -3065,6 +3116,15 @@ impl<'a, M: Machine> FuncGen<'a, M> { ] .iter() .cloned(), + [ + WpType::I64, + WpType::I64, + WpType::I64, + WpType::I64, + WpType::I64, + ] + .iter() + .cloned(), )?; self.release_locations_only_stack(&[dst, src, len]); } @@ -3087,6 +3147,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { }, // [vmctx, segment_index] iter::once(Location::Imm32(segment)), + iter::once(WpType::I64), )?; } Operator::MemoryCopy { src, dst } => { @@ -3137,6 +3198,9 @@ impl<'a, M: Machine> FuncGen<'a, M> { ] .iter() .cloned(), + [WpType::I32, WpType::I64, WpType::I64, WpType::I64] + .iter() + .cloned(), )?; self.release_locations_only_stack(&[dst_pos, src_pos, len]); } @@ -3181,6 +3245,9 @@ impl<'a, M: Machine> FuncGen<'a, M> { [Location::Imm32(memory_index.index() as u32), dst, val, len] .iter() .cloned(), + [WpType::I32, WpType::I64, WpType::I64, WpType::I64] + .iter() + .cloned(), )?; self.release_locations_only_stack(&[dst, val, len]); } @@ -3215,6 +3282,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { // [vmctx, val, memory_index] iter::once(param_pages) .chain(iter::once(Location::Imm32(memory_index.index() as u32))), + [WpType::I64, WpType::I64].iter().cloned(), )?; self.release_locations_only_stack(&[param_pages]); @@ -5412,6 +5480,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { }, // [vmctx, func_index] -> funcref iter::once(Location::Imm32(function_index as u32)), + iter::once(WpType::I64), )?; let ret = self.acquire_locations( @@ -5470,6 +5539,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { [Location::Imm32(table_index.index() as u32), index, value] .iter() .cloned(), + [WpType::I32, WpType::I64, WpType::I64].iter().cloned(), )?; 
self.release_locations_only_stack(&[index, value]); @@ -5504,6 +5574,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { [Location::Imm32(table_index.index() as u32), index] .iter() .cloned(), + [WpType::I32, WpType::I64].iter().cloned(), )?; self.release_locations_only_stack(&[index]); @@ -5547,6 +5618,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { }, // [vmctx, table_index] -> i32 iter::once(Location::Imm32(table_index.index() as u32)), + iter::once(WpType::I32), )?; let ret = self.acquire_locations( @@ -5596,6 +5668,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { ] .iter() .cloned(), + [WpType::I64, WpType::I64, WpType::I64].iter().cloned(), )?; self.release_locations_only_stack(&[init_value, delta]); @@ -5648,6 +5721,15 @@ impl<'a, M: Machine> FuncGen<'a, M> { ] .iter() .cloned(), + [ + WpType::I32, + WpType::I32, + WpType::I64, + WpType::I64, + WpType::I64, + ] + .iter() + .cloned(), )?; self.release_locations_only_stack(&[dest, src, len]); @@ -5679,6 +5761,9 @@ impl<'a, M: Machine> FuncGen<'a, M> { }, // [vmctx, table_index, start_idx, item, len] [Location::Imm32(table), dest, val, len].iter().cloned(), + [WpType::I32, WpType::I64, WpType::I64, WpType::I64] + .iter() + .cloned(), )?; self.release_locations_only_stack(&[dest, val, len]); @@ -5717,6 +5802,15 @@ impl<'a, M: Machine> FuncGen<'a, M> { ] .iter() .cloned(), + [ + WpType::I32, + WpType::I32, + WpType::I64, + WpType::I64, + WpType::I64, + ] + .iter() + .cloned(), )?; self.release_locations_only_stack(&[dest, src, len]); @@ -5742,6 +5836,7 @@ impl<'a, M: Machine> FuncGen<'a, M> { }, // [vmctx, elem_index] [Location::Imm32(segment)].iter().cloned(), + [WpType::I32].iter().cloned(), )?; } _ => { @@ -5762,6 +5857,12 @@ impl<'a, M: Machine> FuncGen<'a, M> { .mark_address_with_trap_code(TrapCode::IntegerDivisionByZero); self.machine.emit_illegal_op(); + self.machine + .emit_label(self.special_labels.integer_overflow); + self.machine + .mark_address_with_trap_code(TrapCode::IntegerOverflow); + self.machine.emit_illegal_op(); + self.machine.emit_label(self.special_labels.heap_access_oob); self.machine .mark_address_with_trap_code(TrapCode::HeapAccessOutOfBounds); diff --git a/lib/compiler-singlepass/src/compiler.rs b/lib/compiler-singlepass/src/compiler.rs index 5599f9f3c65..0e0b08f8cb4 100644 --- a/lib/compiler-singlepass/src/compiler.rs +++ b/lib/compiler-singlepass/src/compiler.rs @@ -4,9 +4,11 @@ use crate::codegen::FuncGen; use crate::config::Singlepass; +use crate::machine::Machine; use crate::machine::{ gen_import_call_trampoline, gen_std_dynamic_import_trampoline, gen_std_trampoline, CodegenError, }; +use crate::machine_arm64::MachineARM64; use crate::machine_x64::MachineX86_64; use loupe::MemoryUsage; #[cfg(feature = "rayon")] @@ -58,17 +60,18 @@ impl Compiler for SinglepassCompiler { _module_translation: &ModuleTranslationState, function_body_inputs: PrimaryMap>, ) -> Result { - /*if target.triple().operating_system == OperatingSystem::Windows { - return Err(CompileError::UnsupportedTarget( - OperatingSystem::Windows.to_string(), - )); - }*/ - if target.triple().architecture != Architecture::X86_64 { - return Err(CompileError::UnsupportedTarget( - target.triple().architecture.to_string(), - )); + match target.triple().architecture { + Architecture::X86_64 => {} + Architecture::Aarch64(_) => {} + _ => { + return Err(CompileError::UnsupportedTarget( + target.triple().architecture.to_string(), + )) + } } - if !target.cpu_features().contains(CpuFeature::AVX) { + if target.triple().architecture == Architecture::X86_64 + && 
!target.cpu_features().contains(CpuFeature::AVX) + { return Err(CompileError::UnsupportedTarget( "x86_64 without AVX".to_string(), )); @@ -79,7 +82,7 @@ impl Compiler for SinglepassCompiler { let calling_convention = match target.triple().default_calling_convention() { Ok(CallingConvention::WindowsFastcall) => CallingConvention::WindowsFastcall, Ok(CallingConvention::SystemV) => CallingConvention::SystemV, - //Ok(CallingConvention::AppleAarch64) => AppleAarch64, + Ok(CallingConvention::AppleAarch64) => CallingConvention::AppleAarch64, _ => panic!("Unsupported Calling convention for Singlepass compiler"), }; @@ -126,30 +129,53 @@ impl Compiler for SinglepassCompiler { } } - let machine = match target.triple().architecture { - Architecture::X86_64 => MachineX86_64::new(), - _ => unimplemented!(), - }; - let mut generator = FuncGen::new( - module, - &self.config, - &vmoffsets, - &memory_styles, - &table_styles, - i, - &locals, - machine, - calling_convention, - ) - .map_err(to_compile_error)?; + match target.triple().architecture { + Architecture::X86_64 => { + let machine = MachineX86_64::new(); + let mut generator = FuncGen::new( + module, + &self.config, + &vmoffsets, + &memory_styles, + &table_styles, + i, + &locals, + machine, + calling_convention, + ) + .map_err(to_compile_error)?; + while generator.has_control_frames() { + generator.set_srcloc(reader.original_position() as u32); + let op = reader.read_operator()?; + generator.feed_operator(op).map_err(to_compile_error)?; + } - while generator.has_control_frames() { - generator.set_srcloc(reader.original_position() as u32); - let op = reader.read_operator()?; - generator.feed_operator(op).map_err(to_compile_error)?; - } + Ok(generator.finalize(&input)) + } + Architecture::Aarch64(_) => { + let machine = MachineARM64::new(); + let mut generator = FuncGen::new( + module, + &self.config, + &vmoffsets, + &memory_styles, + &table_styles, + i, + &locals, + machine, + calling_convention, + ) + .map_err(to_compile_error)?; + while generator.has_control_frames() { + generator.set_srcloc(reader.original_position() as u32); + let op = reader.read_operator()?; + generator.feed_operator(op).map_err(to_compile_error)?; + } - Ok(generator.finalize(&input)) + Ok(generator.finalize(&input)) + } + _ => unimplemented!(), + } }) .collect::, CompileError>>()? 
.into_iter() @@ -252,15 +278,6 @@ mod tests { fn errors_for_unsupported_targets() { let compiler = SinglepassCompiler::new(Singlepass::default()); - // Compile for win64 - /*let win64 = Target::new(triple!("x86_64-pc-windows-msvc"), CpuFeature::for_host()); - let (mut info, translation, inputs) = dummy_compilation_ingredients(); - let result = compiler.compile_module(&win64, &mut info, &translation, inputs); - match result.unwrap_err() { - CompileError::UnsupportedTarget(name) => assert_eq!(name, "windows"), - error => panic!("Unexpected error: {:?}", error), - };*/ - // Compile for 32bit Linux let linux32 = Target::new(triple!("i686-unknown-linux-gnu"), CpuFeature::for_host()); let (mut info, translation, inputs) = dummy_compilation_ingredients(); diff --git a/lib/compiler-singlepass/src/emitter_arm64.rs b/lib/compiler-singlepass/src/emitter_arm64.rs new file mode 100644 index 00000000000..51722a051a1 --- /dev/null +++ b/lib/compiler-singlepass/src/emitter_arm64.rs @@ -0,0 +1,2789 @@ +pub use crate::arm64_decl::{ARM64Register, ArgumentRegisterAllocator, GPR, NEON}; +use crate::common_decl::Size; +use crate::location::Location as AbstractLocation; +pub use crate::location::{Multiplier, Reg}; +pub use crate::machine::{Label, Offset}; +use dynasm::dynasm; +pub use dynasmrt::aarch64::{encode_logical_immediate_32bit, encode_logical_immediate_64bit}; +use dynasmrt::{ + aarch64::Aarch64Relocation, AssemblyOffset, DynamicLabel, DynasmApi, DynasmLabelApi, + VecAssembler, +}; +use wasmer_compiler::{ + CallingConvention, CustomSection, CustomSectionProtection, FunctionBody, SectionBody, +}; +use wasmer_types::{FunctionIndex, FunctionType, Type}; +use wasmer_vm::VMOffsets; + +type Assembler = VecAssembler; + +/// Force `dynasm!` to use the correct arch (aarch64) when cross-compiling. +/// `dynasm!` proc-macro tries to auto-detect it by default by looking at the +/// `target_arch`, but it sees the `target_arch` of the proc-macro itself, which +/// is always equal to host, even when cross-compiling. +macro_rules! dynasm { + ($a:expr ; $($tt:tt)*) => { + dynasm::dynasm!( + $a + ; .arch aarch64 + ; $($tt)* + ) + }; +} + +pub type Location = AbstractLocation; + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[allow(dead_code)] +#[repr(u8)] +pub enum Condition { + // meaning for cmp or sub + /// Equal + Eq = 0, + /// Not equal + Ne = 1, + /// Unsigned higher or same (or carry set) + Cs = 2, + /// Unsigned lower (or carry clear) + Cc = 3, + /// Negative. The mnemonic stands for "minus" + Mi = 4, + /// Positive or zero. The mnemonic stands for "plus" + Pl = 5, + /// Signed overflow. The mnemonic stands for "V set" + Vs = 6, + /// No signed overflow. 
The mnemonic stands for "V clear" + Vc = 7, + /// Unsigned higher + Hi = 8, + /// Unsigned lower or same + Ls = 9, + /// Signed greater than or equal + Ge = 10, + /// Signed less than + Lt = 11, + /// Signed greater than + Gt = 12, + /// Signed less than or equal + Le = 13, + /// Always executed + Al = 14, +} + +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[allow(dead_code)] +pub enum NeonOrMemory { + NEON(NEON), + Memory(GPR, i32), +} + +#[derive(Copy, Clone, Debug)] +#[allow(dead_code)] +pub enum GPROrMemory { + GPR(GPR), + Memory(GPR, i32), +} + +pub trait EmitterARM64 { + fn get_label(&mut self) -> Label; + fn get_offset(&self) -> Offset; + fn get_jmp_instr_size(&self) -> u8; + + fn finalize_function(&mut self); + + fn emit_str(&mut self, sz: Size, reg: Location, addr: Location); + fn emit_ldr(&mut self, sz: Size, reg: Location, addr: Location); + fn emit_stur(&mut self, sz: Size, reg: Location, addr: GPR, offset: i32); + fn emit_ldur(&mut self, sz: Size, reg: Location, addr: GPR, offset: i32); + fn emit_strdb(&mut self, sz: Size, reg: Location, addr: GPR, offset: u32); + fn emit_stria(&mut self, sz: Size, reg: Location, addr: GPR, offset: u32); + fn emit_ldria(&mut self, sz: Size, reg: Location, addr: GPR, offset: u32); + fn emit_stpdb(&mut self, sz: Size, reg1: Location, reg2: Location, addr: GPR, offset: u32); + fn emit_ldpia(&mut self, sz: Size, reg1: Location, reg2: Location, addr: GPR, offset: u32); + + fn emit_ldrb(&mut self, sz: Size, reg: Location, dst: Location); + fn emit_ldrh(&mut self, sz: Size, reg: Location, dst: Location); + fn emit_ldrsb(&mut self, sz: Size, reg: Location, dst: Location); + fn emit_ldrsh(&mut self, sz: Size, reg: Location, dst: Location); + fn emit_ldrsw(&mut self, sz: Size, reg: Location, dst: Location); + fn emit_strb(&mut self, sz: Size, reg: Location, dst: Location); + fn emit_strh(&mut self, sz: Size, reg: Location, dst: Location); + + fn emit_mov(&mut self, sz: Size, src: Location, dst: Location); + + fn emit_movn(&mut self, sz: Size, reg: Location, val: u32); + fn emit_movz(&mut self, reg: Location, val: u32); + fn emit_movk(&mut self, reg: Location, val: u32, shift: u32); + + fn emit_mov_imm(&mut self, dst: Location, val: u64); + + fn emit_add(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_sub(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_mul(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_adds(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_subs(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + + fn emit_add_lsl(&mut self, sz: Size, src1: Location, src2: Location, lsl: u32, dst: Location); + + fn emit_cmp(&mut self, sz: Size, src: Location, dst: Location); + fn emit_tst(&mut self, sz: Size, src: Location, dst: Location); + + fn emit_lsl(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_lsr(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_asr(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_ror(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + + fn emit_or(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_and(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_eor(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + + fn emit_bfc(&mut self, se: Size, lsb: u32, width: u32, dst: Location); + fn emit_bfi(&mut self, se: 
Size, src: Location, lsb: u32, width: u32, dst: Location); + + fn emit_udiv(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_sdiv(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + /// msub : c - a*b -> dst + fn emit_msub(&mut self, sz: Size, a: Location, b: Location, c: Location, dst: Location); + + fn emit_sxtb(&mut self, sz: Size, src: Location, dst: Location); + fn emit_sxth(&mut self, sz: Size, src: Location, dst: Location); + fn emit_sxtw(&mut self, sz: Size, src: Location, dst: Location); + fn emit_uxtb(&mut self, sz: Size, src: Location, dst: Location); + fn emit_uxth(&mut self, sz: Size, src: Location, dst: Location); + + fn emit_cset(&mut self, sz: Size, dst: Location, cond: Condition); + fn emit_csetm(&mut self, sz: Size, dst: Location, cond: Condition); + fn emit_cinc(&mut self, sz: Size, src: Location, dst: Location, cond: Condition); + fn emit_clz(&mut self, sz: Size, src: Location, dst: Location); + fn emit_rbit(&mut self, sz: Size, src: Location, dst: Location); + + fn emit_label(&mut self, label: Label); + fn emit_load_label(&mut self, reg: GPR, label: Label); + fn emit_b_label(&mut self, label: Label); + fn emit_cbz_label(&mut self, sz: Size, reg: Location, label: Label); + fn emit_cbnz_label(&mut self, sz: Size, reg: Location, label: Label); + fn emit_tbz_label(&mut self, sz: Size, reg: Location, n: u32, label: Label); + fn emit_tbnz_label(&mut self, sz: Size, reg: Location, n: u32, label: Label); + fn emit_bcond_label(&mut self, condition: Condition, label: Label); + fn emit_b_register(&mut self, reg: GPR); + fn emit_call_label(&mut self, label: Label); + fn emit_call_register(&mut self, reg: GPR); + fn emit_ret(&mut self); + + fn emit_udf(&mut self); + fn emit_dmb(&mut self); + fn emit_brk(&mut self); + + fn emit_fcmp(&mut self, sz: Size, src1: Location, src2: Location); + fn emit_fneg(&mut self, sz: Size, src: Location, dst: Location); + fn emit_fsqrt(&mut self, sz: Size, src: Location, dst: Location); + + fn emit_fadd(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_fsub(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_fmul(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_fdiv(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + + fn emit_fmin(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + fn emit_fmax(&mut self, sz: Size, src1: Location, src2: Location, dst: Location); + + fn emit_frintz(&mut self, sz: Size, src: Location, dst: Location); + fn emit_frintn(&mut self, sz: Size, src: Location, dst: Location); + fn emit_frintm(&mut self, sz: Size, src: Location, dst: Location); + fn emit_frintp(&mut self, sz: Size, src: Location, dst: Location); + + fn emit_scvtf(&mut self, sz_in: Size, src: Location, sz_out: Size, dst: Location); + fn emit_ucvtf(&mut self, sz_in: Size, src: Location, sz_out: Size, dst: Location); + fn emit_fcvt(&mut self, sz_in: Size, src: Location, dst: Location); + fn emit_fcvtzs(&mut self, sz_in: Size, src: Location, sz_out: Size, dst: Location); + fn emit_fcvtzu(&mut self, sz_in: Size, src: Location, sz_out: Size, dst: Location); + + fn emit_read_fpcr(&mut self, reg: GPR); + fn emit_write_fpcr(&mut self, reg: GPR); + fn emit_read_fpsr(&mut self, reg: GPR); + fn emit_write_fpsr(&mut self, reg: GPR); + + fn arch_supports_canonicalize_nan(&self) -> bool { + true + } + + fn arch_requires_indirect_call_trampoline(&self) -> bool { + false + } + + fn 
arch_emit_indirect_call_with_trampoline(&mut self, _loc: Location) { + unimplemented!() + } +} + +impl EmitterARM64 for Assembler { + fn get_label(&mut self) -> DynamicLabel { + self.new_dynamic_label() + } + + fn get_offset(&self) -> AssemblyOffset { + self.offset() + } + + fn get_jmp_instr_size(&self) -> u8 { + 4 // relative jump, not full 32bits capable + } + + fn finalize_function(&mut self) { + dynasm!( + self + ; const_neg_one_32: + ; .word -1 + ; const_zero_32: + ; .word 0 + ; const_pos_one_32: + ; .word 1 + ); + } + + fn emit_str(&mut self, sz: Size, reg: Location, addr: Location) { + match (sz, reg, addr) { + (Size::S64, Location::GPR(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let disp = disp as u32; + assert!((disp & 0x7) == 0 && (disp < 0x8000)); + dynasm!(self ; str X(reg), [X(addr), disp]); + } + (Size::S32, Location::GPR(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let disp = disp as u32; + assert!((disp & 0x3) == 0 && (disp < 0x4000)); + dynasm!(self ; str W(reg), [X(addr), disp]); + } + (Size::S16, Location::GPR(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let disp = disp as u32; + assert!((disp & 0x1) == 0 && (disp < 0x2000)); + dynasm!(self ; strh W(reg), [X(addr), disp]); + } + (Size::S8, Location::GPR(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let disp = disp as u32; + assert!(disp < 0x1000); + dynasm!(self ; strb W(reg), [X(addr), disp]); + } + (Size::S64, Location::SIMD(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let disp = disp as u32; + assert!((disp & 0x7) == 0 && (disp < 0x8000)); + dynasm!(self ; str D(reg), [X(addr), disp]); + } + (Size::S32, Location::SIMD(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let disp = disp as u32; + assert!((disp & 0x3) == 0 && (disp < 0x4000)); + dynasm!(self ; str S(reg), [X(addr), disp]); + } + (Size::S64, Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; str X(reg), [X(addr)]), + 1 => dynasm!(self ; str X(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; str X(reg), [X(addr), X(r2), LSL mult]), + }; + } + (Size::S32, Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; str W(reg), [X(addr)]), + 1 => dynasm!(self ; str W(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; str W(reg), [X(addr), X(r2), LSL mult]), + }; + } + _ => panic!("singlepass can't emit STR {:?}, {:?}, {:?}", sz, reg, addr), + } + } + fn emit_ldr(&mut self, sz: Size, reg: Location, addr: Location) { + match (sz, reg, addr) { + (Size::S64, Location::GPR(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + assert!((disp & 0x7) == 0 && (disp < 0x8000)); + let disp = disp as u32; + dynasm!(self ; ldr X(reg), [X(addr), disp]); + } + (Size::S32, 
Location::GPR(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + assert!((disp & 0x3) == 0 && (disp < 0x4000)); + let disp = disp as u32; + dynasm!(self ; ldr W(reg), [X(addr), disp]); + } + (Size::S16, Location::GPR(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + assert!((disp & 0x1 == 0) && (disp < 0x2000)); + let disp = disp as u32; + dynasm!(self ; ldrh W(reg), [X(addr), disp]); + } + (Size::S8, Location::GPR(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + assert!(disp < 0x1000); + let disp = disp as u32; + dynasm!(self ; ldrb W(reg), [X(addr), disp]); + } + (Size::S64, Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; ldr X(reg), [X(addr)]), + 1 => dynasm!(self ; ldr X(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; ldr X(reg), [X(addr), X(r2), LSL mult]), + }; + } + (Size::S32, Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; ldr W(reg), [X(addr)]), + 1 => dynasm!(self ; ldr W(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; ldr W(reg), [X(addr), X(r2), LSL mult]), + }; + } + (Size::S64, Location::SIMD(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let disp = disp as u32; + assert!((disp & 0x7) == 0 && (disp < 0x8000)); + dynasm!(self ; ldr D(reg), [X(addr), disp]); + } + (Size::S32, Location::SIMD(reg), Location::Memory(addr, disp)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let disp = disp as u32; + assert!((disp & 0x3) == 0 && (disp < 0x4000)); + dynasm!(self ; ldr S(reg), [X(addr), disp]); + } + _ => panic!("singlepass can't emit LDR {:?}, {:?}, {:?}", sz, reg, addr), + } + } + fn emit_stur(&mut self, sz: Size, reg: Location, addr: GPR, offset: i32) { + assert!((offset >= -255) && (offset <= 255)); + match (sz, reg) { + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; stur X(reg), [X(addr), offset]); + } + (Size::S32, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; stur W(reg), [X(addr), offset]); + } + (Size::S64, Location::SIMD(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; stur D(reg), [X(addr), offset]); + } + _ => panic!( + "singlepass can't emit STUR {:?}, {:?}, {:?}, {:?}", + sz, reg, addr, offset + ), + } + } + fn emit_ldur(&mut self, sz: Size, reg: Location, addr: GPR, offset: i32) { + assert!((offset >= -255) && (offset <= 255)); + match (sz, reg) { + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; ldur X(reg), [X(addr), offset]); + } + (Size::S32, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; ldur W(reg), [X(addr), offset]); + } + (Size::S64, Location::SIMD(reg)) => { + let reg = 
reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; ldur D(reg), [X(addr), offset]); + } + _ => panic!( + "singlepass can't emit LDUR {:?}, {:?}, {:?}, {:?}", + sz, reg, addr, offset + ), + } + } + + fn emit_strdb(&mut self, sz: Size, reg: Location, addr: GPR, offset: u32) { + assert!(offset <= 255); + match (sz, reg) { + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; str X(reg), [X(addr), -(offset as i32)]!); + } + (Size::S64, Location::SIMD(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; str D(reg), [X(addr), -(offset as i32)]!); + } + _ => unreachable!(), + } + } + fn emit_stria(&mut self, sz: Size, reg: Location, addr: GPR, offset: u32) { + assert!(offset <= 255); + match (sz, reg) { + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; str X(reg), [X(addr)], (offset as i32)); + } + (Size::S64, Location::SIMD(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; str D(reg), [X(addr)], (offset as i32)); + } + _ => unreachable!(), + } + } + fn emit_ldria(&mut self, sz: Size, reg: Location, addr: GPR, offset: u32) { + assert!(offset <= 255); + match (sz, reg) { + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; ldr X(reg), [X(addr)], offset); + } + (Size::S64, Location::SIMD(reg)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; ldr D(reg), [X(addr)], offset); + } + _ => unreachable!(), + } + } + + fn emit_stpdb(&mut self, sz: Size, reg1: Location, reg2: Location, addr: GPR, offset: u32) { + assert!(offset <= 255); + match (sz, reg1, reg2) { + (Size::S64, Location::GPR(reg1), Location::GPR(reg2)) => { + let reg1 = reg1.into_index() as u32; + let reg2 = reg2.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; stp X(reg1), X(reg2), [X(addr), -(offset as i32)]!); + } + _ => unreachable!(), + } + } + fn emit_ldpia(&mut self, sz: Size, reg1: Location, reg2: Location, addr: GPR, offset: u32) { + assert!(offset <= 255); + match (sz, reg1, reg2) { + (Size::S64, Location::GPR(reg1), Location::GPR(reg2)) => { + let reg1 = reg1.into_index() as u32; + let reg2 = reg2.into_index() as u32; + let addr = addr.into_index() as u32; + dynasm!(self ; ldp X(reg1), X(reg2), [X(addr)], offset); + } + _ => unreachable!(), + } + } + + fn emit_ldrb(&mut self, _sz: Size, reg: Location, dst: Location) { + match (reg, dst) { + (Location::GPR(reg), Location::Memory(addr, offset)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let offset = offset as u32; + assert!(offset < 0x1000); + dynasm!(self ; ldrb W(reg), [X(addr), offset]); + } + (Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; ldrb W(reg), [X(addr)]), + 1 => dynasm!(self ; ldrb W(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; ldrb W(reg), [X(addr), X(r2), LSL mult]), + }; + } + _ => panic!("singlepass can't emit LDRB {:?}, {:?}", reg, dst), + } + } + fn emit_ldrh(&mut self, _sz: Size, reg: Location, dst: Location) { + match (reg, dst) { + (Location::GPR(reg), 
Location::Memory(addr, offset)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let offset = offset as u32; + assert!((offset & 1 == 0) && (offset < 0x2000)); + dynasm!(self ; ldrh W(reg), [X(addr), offset]); + } + (Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; ldrh W(reg), [X(addr)]), + 1 => dynasm!(self ; ldrh W(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; ldrh W(reg), [X(addr), X(r2), LSL mult]), + }; + } + _ => panic!("singlepass can't emit LDRH {:?}, {:?}", reg, dst), + } + } + fn emit_ldrsb(&mut self, sz: Size, reg: Location, dst: Location) { + match (sz, reg, dst) { + (Size::S64, Location::GPR(reg), Location::Memory(addr, offset)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let offset = offset as u32; + assert!(offset < 0x1000); + dynasm!(self ; ldrsb X(reg), [X(addr), offset]); + } + (Size::S32, Location::GPR(reg), Location::Memory(addr, offset)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let offset = offset as u32; + assert!(offset < 0x1000); + dynasm!(self ; ldrsb W(reg), [X(addr), offset]); + } + (Size::S64, Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; ldrsb X(reg), [X(addr)]), + 1 => dynasm!(self ; ldrsb X(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; ldrsb X(reg), [X(addr), X(r2), LSL mult]), + }; + } + (Size::S32, Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; ldrsb W(reg), [X(addr)]), + 1 => dynasm!(self ; ldrsb W(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; ldrsb W(reg), [X(addr), X(r2), LSL mult]), + }; + } + _ => panic!("singlepass can't emit LDRSB {:?}, {:?}, {:?}", sz, reg, dst), + } + } + fn emit_ldrsh(&mut self, sz: Size, reg: Location, dst: Location) { + match (sz, reg, dst) { + (Size::S64, Location::GPR(reg), Location::Memory(addr, offset)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let offset = offset as u32; + assert!((offset & 1 == 0) && (offset < 0x2000)); + dynasm!(self ; ldrsh X(reg), [X(addr), offset]); + } + (Size::S32, Location::GPR(reg), Location::Memory(addr, offset)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let offset = offset as u32; + assert!((offset & 1 == 0) && (offset < 0x2000)); + dynasm!(self ; ldrsh W(reg), [X(addr), offset]); + } + (Size::S64, Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; ldrsh X(reg), [X(addr)]), + 1 => dynasm!(self ; ldrsh X(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; ldrsh X(reg), [X(addr), X(r2), LSL mult]), + }; + } + (Size::S32, Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let 
r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; ldrsh W(reg), [X(addr)]), + 1 => dynasm!(self ; ldrsh W(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; ldrsh W(reg), [X(addr), X(r2), LSL mult]), + }; + } + _ => panic!("singlepass can't emit LDRSH {:?}, {:?}, {:?}", sz, reg, dst), + } + } + fn emit_ldrsw(&mut self, sz: Size, reg: Location, dst: Location) { + match (sz, reg, dst) { + (Size::S64, Location::GPR(reg), Location::Memory(addr, offset)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let offset = offset as u32; + assert!((offset & 3 == 0) && (offset < 0x4000)); + dynasm!(self ; ldrsw X(reg), [X(addr), offset]); + } + (Size::S64, Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; ldrsw X(reg), [X(addr)]), + 1 => dynasm!(self ; ldrsw X(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; ldrsw X(reg), [X(addr), X(r2), LSL mult]), + }; + } + _ => panic!("singlepass can't emit LDRSW {:?}, {:?}, {:?}", sz, reg, dst), + } + } + fn emit_strb(&mut self, _sz: Size, reg: Location, dst: Location) { + match (reg, dst) { + (Location::GPR(reg), Location::Memory(addr, offset)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let offset = offset as u32; + assert!(offset < 0x1000); + dynasm!(self ; strb W(reg), [X(addr), offset]); + } + (Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; strb W(reg), [X(addr)]), + 1 => dynasm!(self ; strb W(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; strb W(reg), [X(addr), X(r2), LSL mult]), + }; + } + _ => panic!("singlepass can't emit STRB {:?}, {:?}", reg, dst), + } + } + fn emit_strh(&mut self, _sz: Size, reg: Location, dst: Location) { + match (reg, dst) { + (Location::GPR(reg), Location::Memory(addr, offset)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let offset = offset as u32; + assert!((offset & 1 == 0) && (offset < 0x2000)); + dynasm!(self ; strh W(reg), [X(addr), offset]); + } + (Location::GPR(reg), Location::Memory2(addr, r2, mult, offs)) => { + let reg = reg.into_index() as u32; + let addr = addr.into_index() as u32; + let r2 = r2.into_index() as u32; + assert!(offs == 0); + let mult = mult as u32; + match mult { + 0 => dynasm!(self ; strh W(reg), [X(addr)]), + 1 => dynasm!(self ; strh W(reg), [X(addr), X(r2)]), + _ => dynasm!(self ; strh W(reg), [X(addr), X(r2), LSL mult]), + }; + } + _ => panic!("singlepass can't emit STRH {:?}, {:?}", reg, dst), + } + } + + fn emit_mov(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S64, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mov X(dst), X(src)); + } + (Size::S32, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mov W(dst), W(src)); + } + (Size::S64, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mov V(dst).D[0], V(src).D[0]); 
+ } + (Size::S32, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mov V(dst).S[0], V(src).S[0]); + } + (Size::S64, Location::GPR(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mov V(dst).D[0], X(src)); + } + (Size::S32, Location::GPR(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mov V(dst).S[0], W(src)); + } + (Size::S64, Location::SIMD(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mov X(dst), V(src).D[0]); + } + (Size::S32, Location::SIMD(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mov W(dst), V(src).S[0]); + } + (Size::S32, Location::Imm32(val), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + if val < 0x1000 { + dynasm!(self ; mov W(dst), val as u64); + } else if encode_logical_immediate_32bit(val as _).is_some() { + dynasm!(self ; orr W(dst), wzr, val); + } else { + unreachable!(); + } + } + (Size::S64, Location::Imm32(val), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + if val < 0x1000 { + dynasm!(self ; mov W(dst), val as u64); + } else if encode_logical_immediate_64bit(val as _).is_some() { + dynasm!(self ; orr X(dst), xzr, val as u64); + } else { + unreachable!(); + } + } + (Size::S64, Location::Imm64(val), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + if val < 0x1000 { + dynasm!(self ; mov W(dst), val as u64); + } else if encode_logical_immediate_64bit(val as _).is_some() { + dynasm!(self ; orr X(dst), xzr, val as u64); + } else { + unreachable!(); + } + } + _ => panic!("singlepass can't emit MOV {:?}, {:?}, {:?}", sz, src, dst), + } + } + + fn emit_movn(&mut self, sz: Size, reg: Location, val: u32) { + match (sz, reg) { + (Size::S32, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; movn W(reg), val); + } + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; movn X(reg), val); + } + _ => unreachable!(), + } + } + fn emit_movz(&mut self, reg: Location, val: u32) { + match reg { + Location::GPR(reg) => { + let reg = reg.into_index() as u32; + dynasm!(self ; movz W(reg), val); + } + _ => unreachable!(), + } + } + fn emit_movk(&mut self, reg: Location, val: u32, shift: u32) { + match reg { + Location::GPR(reg) => { + let reg = reg.into_index() as u32; + dynasm!(self ; movk X(reg), val, LSL shift); + } + _ => unreachable!(), + } + } + + fn emit_mov_imm(&mut self, dst: Location, val: u64) { + match dst { + Location::GPR(dst) => { + let dst = dst.into_index() as u32; + let offset = val.trailing_zeros() & 48; + let masked = 0xffff & (val >> offset); + if (masked << offset) == val { + dynasm!(self ; movz X(dst), masked as u32, LSL offset); + } else { + dynasm!(self ; movz W(dst), (val&0xffff) as u32); + let val = val >> 16; + if val != 0 { + dynasm!(self ; movk X(dst), (val&0xffff) as u32, LSL 16); + let val = val >> 16; + if val != 0 { + dynasm!(self ; movk X(dst), (val&0xffff) as u32, LSL 32); + let val = val >> 16; + if val != 0 { + dynasm!(self ; movk X(dst), (val&0xffff) as u32, LSL 48); + } + } + } + } + } + _ => panic!("singlepass can't emit MOVW {:?}", dst), + } + } + + fn emit_add(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, 
dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; add X(dst), X(src1), X(src2)); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; add W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; add X(dst), X(src1), imm as u32); + } + (Size::S64, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm32(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; add X(dst), X(src1), imm); + } + (Size::S64, Location::GPR(src1), Location::Imm64(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm64(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + let imm = imm as u32; + dynasm!(self ; add X(dst), X(src1), imm); + } + (Size::S32, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; add W(dst), W(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm32(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; add W(dst), W(src1), imm); + } + _ => panic!( + "singlepass can't emit ADD {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_sub(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sub X(dst), X(src1), X(src2)); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sub W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sub X(dst), X(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sub W(dst), W(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; sub W(dst), W(src1), imm); + } + (Size::S64, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) 
=> { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; sub X(dst), X(src1), imm); + } + (Size::S64, Location::GPR(src1), Location::Imm64(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; sub X(dst), X(src1), imm as u32); + } + _ => panic!( + "singlepass can't emit SUB {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_mul(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mul X(dst), X(src1), X(src2)); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; mul W(dst), W(src1), W(src2)); + } + _ => panic!( + "singlepass can't emit MUL {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_adds(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; adds X(dst), X(src1), X(src2)); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; adds W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; adds X(dst), X(src1), imm as u32); + } + (Size::S64, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm32(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; adds X(dst), X(src1), imm); + } + (Size::S32, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; adds W(dst), W(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm32(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; adds W(dst), W(src1), imm); + } + _ => panic!( + "singlepass can't emit ADD.S {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_subs(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; subs X(dst), X(src1), 
X(src2)); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; subs W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; subs X(dst), X(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; subs W(dst), W(src1), imm as u32); + } + _ => panic!( + "singlepass can't emit SUB.S {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_add_lsl(&mut self, sz: Size, src1: Location, src2: Location, lsl: u32, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; add X(dst), X(src1), X(src2), LSL lsl); + } + _ => panic!( + "singlepass can't emit LSL {:?} {:?} {:?} {:?} LSL {:?}", + sz, src1, src2, dst, lsl + ), + } + } + + fn emit_cmp(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S64, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; cmp X(dst), X(src)); + } + (Size::S32, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; cmp W(dst), W(src)); + } + (Size::S64, Location::Imm8(imm), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + dynasm!(self ; cmp X(dst), imm as u32); + } + (Size::S64, Location::Imm32(imm), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; cmp X(dst), imm as u32); + } + (Size::S64, Location::Imm64(imm), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; cmp X(dst), imm as u32); + } + (Size::S32, Location::Imm8(imm), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + dynasm!(self ; cmp W(dst), imm as u32); + } + (Size::S32, Location::Imm32(imm), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + if imm >= 0x1000 { + unreachable!(); + } + dynasm!(self ; cmp W(dst), imm as u32); + } + _ => panic!("singlepass can't emit CMP {:?} {:?} {:?}", sz, src, dst), + } + } + + fn emit_tst(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S64, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; tst X(dst), X(src)); + } + (Size::S64, Location::Imm32(imm), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + if !encode_logical_immediate_64bit(imm as u64).is_some() { + unreachable!(); + } + dynasm!(self ; tst X(dst), imm as u64); + } + (Size::S64, Location::Imm64(imm), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + if !encode_logical_immediate_64bit(imm as u64).is_some() { + unreachable!(); + } + dynasm!(self ; tst X(dst), imm as u64); + } + (Size::S32, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; tst W(dst), 
W(src)); + } + (Size::S32, Location::Imm32(imm), Location::GPR(dst)) => { + let dst = dst.into_index() as u32; + if !encode_logical_immediate_64bit(imm as u64).is_some() { + unreachable!(); + } + dynasm!(self ; tst W(dst), imm); + } + _ => unreachable!(), + } + } + + fn emit_lsl(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; lsl X(dst), X(src1), X(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + if imm > 63 { + unreachable!(); + } + let imm = imm as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; lsl X(dst), X(src1), imm); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; lsl W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm > 63 { + unreachable!(); + } + dynasm!(self ; lsl X(dst), X(src1), imm as u32); + } + (Size::S64, Location::GPR(src1), Location::Imm64(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm64(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm > 63 { + unreachable!(); + } + dynasm!(self ; lsl X(dst), X(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm > 31 { + unreachable!(); + } + dynasm!(self ; lsl W(dst), W(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm32(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm > 31 { + unreachable!(); + } + dynasm!(self ; lsl W(dst), W(src1), imm as u32); + } + _ => panic!( + "singlepass can't emit LSL {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_asr(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; asr X(dst), X(src1), X(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let imm = imm as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 63 { + unreachable!(); + } + dynasm!(self ; asr X(dst), X(src1), imm); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; asr W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S64, 
Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 63 { + unreachable!(); + } + dynasm!(self ; asr X(dst), X(src1), imm as u32); + } + (Size::S64, Location::GPR(src1), Location::Imm64(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm64(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 63 { + unreachable!(); + } + dynasm!(self ; asr X(dst), X(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 31 { + unreachable!(); + } + dynasm!(self ; asr W(dst), W(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm32(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 31 { + unreachable!(); + } + dynasm!(self ; asr W(dst), W(src1), imm as u32); + } + _ => panic!( + "singlepass can't emit ASR {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_lsr(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; lsr X(dst), X(src1), X(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let imm = imm as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 63 { + unreachable!(); + } + dynasm!(self ; lsr X(dst), X(src1), imm); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; lsr W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 63 { + unreachable!(); + } + dynasm!(self ; lsr X(dst), X(src1), imm as u32); + } + (Size::S64, Location::GPR(src1), Location::Imm64(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm64(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 63 { + unreachable!(); + } + dynasm!(self ; lsr X(dst), X(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 31 { + unreachable!(); + } + dynasm!(self ; lsr W(dst), W(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm32(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 31 { + 
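+ // The immediate forms here only accept shift amounts in 1..=31 for 32-bit operands;
+ // anything else is treated as a bug in the caller, so abort rather than emit bad code.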
unreachable!(); + } + dynasm!(self ; lsr W(dst), W(src1), imm as u32); + } + _ => panic!( + "singlepass can't emit LSR {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_ror(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ror X(dst), X(src1), X(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm32(imm), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let imm = imm as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 63 { + unreachable!(); + } + dynasm!(self ; ror X(dst), X(src1), imm); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ror W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S64, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 63 { + unreachable!(); + } + dynasm!(self ; ror X(dst), X(src1), imm as u32); + } + (Size::S32, Location::GPR(src1), Location::Imm8(imm), Location::GPR(dst)) + | (Size::S32, Location::Imm8(imm), Location::GPR(src1), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let dst = dst.into_index() as u32; + if imm == 0 || imm > 31 { + unreachable!(); + } + dynasm!(self ; ror W(dst), W(src1), imm as u32); + } + _ => panic!( + "singlepass can't emit ROR {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + + fn emit_or(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; orr X(dst), X(src1), X(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm64(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2 as u64; + let dst = dst.into_index() as u32; + if !encode_logical_immediate_64bit(src2 as u64).is_some() { + unreachable!(); + } + dynasm!(self ; orr X(dst), X(src1), src2); + } + (Size::S32, Location::GPR(src1), Location::Imm32(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2 as u32; + let dst = dst.into_index() as u32; + if !encode_logical_immediate_32bit(src2).is_some() { + unreachable!(); + } + dynasm!(self ; orr W(dst), W(src1), src2); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; orr W(dst), W(src1), W(src2)); + } + _ => panic!( + "singlepass can't emit OR {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_and(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; and X(dst), X(src1), X(src2)); + } 
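+ // The immediate arms below rely on the AArch64 "logical immediate" encoding (a repeated,
+ // rotated run of ones). encode_logical_immediate_{32,64}bit verifies that the constant is
+ // encodable; a value that is not encodable is treated as a caller bug rather than
+ // materialized into a register here.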
+ (Size::S64, Location::GPR(src1), Location::Imm64(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2 as u64; + let dst = dst.into_index() as u32; + if !encode_logical_immediate_64bit(src2 as u64).is_some() { + unreachable!(); + } + dynasm!(self ; and X(dst), X(src1), src2); + } + (Size::S32, Location::GPR(src1), Location::Imm32(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2 as u32; + let dst = dst.into_index() as u32; + if !encode_logical_immediate_32bit(src2).is_some() { + unreachable!(); + } + dynasm!(self ; and W(dst), W(src1), src2); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; and W(dst), W(src1), W(src2)); + } + _ => panic!( + "singlepass can't emit AND {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_eor(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; eor X(dst), X(src1), X(src2)); + } + (Size::S64, Location::GPR(src1), Location::Imm64(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2 as u64; + let dst = dst.into_index() as u32; + if !encode_logical_immediate_64bit(src2 as u64).is_some() { + unreachable!(); + } + dynasm!(self ; eor X(dst), X(src1), src2); + } + (Size::S32, Location::GPR(src1), Location::Imm32(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2 as u32; + let dst = dst.into_index() as u32; + if !encode_logical_immediate_32bit(src2).is_some() { + unreachable!(); + } + dynasm!(self ; eor W(dst), W(src1), src2); + } + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; eor W(dst), W(src1), W(src2)); + } + _ => panic!( + "singlepass can't emit EOR {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + + fn emit_bfc(&mut self, sz: Size, lsb: u32, width: u32, dst: Location) { + match (sz, dst) { + (Size::S32, Location::GPR(dst)) => { + dynasm!(self ; bfc W(dst as u32), lsb, width); + } + (Size::S64, Location::GPR(dst)) => { + dynasm!(self ; bfc X(dst as u32), lsb, width); + } + _ => unimplemented!(), + } + } + fn emit_bfi(&mut self, sz: Size, src: Location, lsb: u32, width: u32, dst: Location) { + match (sz, src, dst) { + (Size::S32, Location::GPR(src), Location::GPR(dst)) => { + dynasm!(self ; bfi W(dst as u32), W(src as u32), lsb, width); + } + (Size::S64, Location::GPR(src), Location::GPR(dst)) => { + dynasm!(self ; bfi X(dst as u32), X(src as u32), lsb, width); + } + _ => unimplemented!(), + } + } + + fn emit_udiv(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; udiv W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() 
as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; udiv X(dst), X(src1), X(src2)); + } + _ => panic!( + "singlepass can't emit UDIV {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_sdiv(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S32, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sdiv W(dst), W(src1), W(src2)); + } + (Size::S64, Location::GPR(src1), Location::GPR(src2), Location::GPR(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sdiv X(dst), X(src1), X(src2)); + } + _ => panic!( + "singlepass can't emit UDIV {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + + /// msub : c - a*b -> dst + fn emit_msub(&mut self, sz: Size, a: Location, b: Location, c: Location, dst: Location) { + match (sz, a, b, c, dst) { + ( + Size::S32, + Location::GPR(a), + Location::GPR(b), + Location::GPR(c), + Location::GPR(dst), + ) => { + let a = a.into_index() as u32; + let b = b.into_index() as u32; + let c = c.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; msub W(dst), W(a), W(b), W(c)); + } + ( + Size::S64, + Location::GPR(a), + Location::GPR(b), + Location::GPR(c), + Location::GPR(dst), + ) => { + let a = a.into_index() as u32; + let b = b.into_index() as u32; + let c = c.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; msub X(dst), X(a), X(b), X(c)); + } + _ => panic!( + "singlepass can't emit msub {:?} {:?} {:?} {:?} {:?}", + sz, a, b, c, dst + ), + } + } + + fn emit_sxtb(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S32, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sxtb W(dst), W(src)); + } + (Size::S64, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sxtb X(dst), W(src)); + } + _ => panic!("singlepass can't emit SXTB {:?} {:?} {:?}", sz, src, dst), + } + } + fn emit_sxth(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S32, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sxth W(dst), W(src)); + } + (Size::S64, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sxth X(dst), W(src)); + } + _ => panic!("singlepass can't emit SXTH {:?} {:?} {:?}", sz, src, dst), + } + } + fn emit_sxtw(&mut self, _sz: Size, src: Location, dst: Location) { + match (src, dst) { + (Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; sxtw X(dst), W(src)); + } + _ => panic!("singlepass can't emit SXTW {:?} {:?}", src, dst), + } + } + fn emit_uxtb(&mut self, _sz: Size, src: Location, dst: Location) { + match (src, dst) { + (Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; uxtb W(dst), W(src)); + } + _ => panic!("singlepass can't emit UXTB {:?} {:?}", src, dst), + } + } + fn emit_uxth(&mut self, _sz: Size, src: Location, dst: Location) { + match 
(src, dst) { + (Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; uxth W(dst), W(src)); + } + _ => panic!("singlepass can't emit UXTH {:?} {:?}", src, dst), + } + } + + fn emit_cset(&mut self, sz: Size, dst: Location, cond: Condition) { + match (sz, dst) { + (Size::S32, Location::GPR(reg)) => { + let reg = reg as u32; + match cond { + Condition::Eq => dynasm!(self ; cset W(reg), eq), + Condition::Ne => dynasm!(self ; cset W(reg), ne), + Condition::Cs => dynasm!(self ; cset W(reg), cs), + Condition::Cc => dynasm!(self ; cset W(reg), cc), + Condition::Mi => dynasm!(self ; cset W(reg), mi), + Condition::Pl => dynasm!(self ; cset W(reg), pl), + Condition::Vs => dynasm!(self ; cset W(reg), vs), + Condition::Vc => dynasm!(self ; cset W(reg), vc), + Condition::Hi => dynasm!(self ; cset W(reg), hi), + Condition::Ls => dynasm!(self ; cset W(reg), ls), + Condition::Ge => dynasm!(self ; cset W(reg), ge), + Condition::Lt => dynasm!(self ; cset W(reg), lt), + Condition::Gt => dynasm!(self ; cset W(reg), gt), + Condition::Le => dynasm!(self ; cset W(reg), le), + Condition::Al => dynasm!(self ; cset W(reg), al), + } + } + (Size::S64, Location::GPR(reg)) => { + let reg = reg as u32; + match cond { + Condition::Eq => dynasm!(self ; cset X(reg), eq), + Condition::Ne => dynasm!(self ; cset X(reg), ne), + Condition::Cs => dynasm!(self ; cset X(reg), cs), + Condition::Cc => dynasm!(self ; cset X(reg), cc), + Condition::Mi => dynasm!(self ; cset X(reg), mi), + Condition::Pl => dynasm!(self ; cset X(reg), pl), + Condition::Vs => dynasm!(self ; cset X(reg), vs), + Condition::Vc => dynasm!(self ; cset X(reg), vc), + Condition::Hi => dynasm!(self ; cset X(reg), hi), + Condition::Ls => dynasm!(self ; cset X(reg), ls), + Condition::Ge => dynasm!(self ; cset X(reg), ge), + Condition::Lt => dynasm!(self ; cset X(reg), lt), + Condition::Gt => dynasm!(self ; cset X(reg), gt), + Condition::Le => dynasm!(self ; cset X(reg), le), + Condition::Al => dynasm!(self ; cset X(reg), al), + } + } + _ => panic!("singlepass can't emit CSET {:?} {:?} {:?}", sz, dst, cond), + } + } + fn emit_csetm(&mut self, sz: Size, dst: Location, cond: Condition) { + match (sz, dst) { + (Size::S32, Location::GPR(reg)) => { + let reg = reg as u32; + match cond { + Condition::Eq => dynasm!(self ; csetm W(reg), eq), + Condition::Ne => dynasm!(self ; csetm W(reg), ne), + Condition::Cs => dynasm!(self ; csetm W(reg), cs), + Condition::Cc => dynasm!(self ; csetm W(reg), cc), + Condition::Mi => dynasm!(self ; csetm W(reg), mi), + Condition::Pl => dynasm!(self ; csetm W(reg), pl), + Condition::Vs => dynasm!(self ; csetm W(reg), vs), + Condition::Vc => dynasm!(self ; csetm W(reg), vc), + Condition::Hi => dynasm!(self ; csetm W(reg), hi), + Condition::Ls => dynasm!(self ; csetm W(reg), ls), + Condition::Ge => dynasm!(self ; csetm W(reg), ge), + Condition::Lt => dynasm!(self ; csetm W(reg), lt), + Condition::Gt => dynasm!(self ; csetm W(reg), gt), + Condition::Le => dynasm!(self ; csetm W(reg), le), + Condition::Al => dynasm!(self ; csetm W(reg), al), + } + } + (Size::S64, Location::GPR(reg)) => { + let reg = reg as u32; + match cond { + Condition::Eq => dynasm!(self ; csetm X(reg), eq), + Condition::Ne => dynasm!(self ; csetm X(reg), ne), + Condition::Cs => dynasm!(self ; csetm X(reg), cs), + Condition::Cc => dynasm!(self ; csetm X(reg), cc), + Condition::Mi => dynasm!(self ; csetm X(reg), mi), + Condition::Pl => dynasm!(self ; csetm X(reg), pl), + Condition::Vs => 
dynasm!(self ; csetm X(reg), vs), + Condition::Vc => dynasm!(self ; csetm X(reg), vc), + Condition::Hi => dynasm!(self ; csetm X(reg), hi), + Condition::Ls => dynasm!(self ; csetm X(reg), ls), + Condition::Ge => dynasm!(self ; csetm X(reg), ge), + Condition::Lt => dynasm!(self ; csetm X(reg), lt), + Condition::Gt => dynasm!(self ; csetm X(reg), gt), + Condition::Le => dynasm!(self ; csetm X(reg), le), + Condition::Al => dynasm!(self ; csetm X(reg), al), + } + } + _ => panic!("singlepass can't emit CSETM {:?} {:?} {:?}", sz, dst, cond), + } + } + fn emit_cinc(&mut self, sz: Size, src: Location, dst: Location, cond: Condition) { + match (sz, src, dst) { + (Size::S32, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + match cond { + Condition::Eq => dynasm!(self ; cinc W(dst), W(src), eq), + Condition::Ne => dynasm!(self ; cinc W(dst), W(src), ne), + Condition::Cs => dynasm!(self ; cinc W(dst), W(src), cs), + Condition::Cc => dynasm!(self ; cinc W(dst), W(src), cc), + Condition::Mi => dynasm!(self ; cinc W(dst), W(src), mi), + Condition::Pl => dynasm!(self ; cinc W(dst), W(src), pl), + Condition::Vs => dynasm!(self ; cinc W(dst), W(src), vs), + Condition::Vc => dynasm!(self ; cinc W(dst), W(src), vc), + Condition::Hi => dynasm!(self ; cinc W(dst), W(src), hi), + Condition::Ls => dynasm!(self ; cinc W(dst), W(src), ls), + Condition::Ge => dynasm!(self ; cinc W(dst), W(src), ge), + Condition::Lt => dynasm!(self ; cinc W(dst), W(src), lt), + Condition::Gt => dynasm!(self ; cinc W(dst), W(src), gt), + Condition::Le => dynasm!(self ; cinc W(dst), W(src), le), + Condition::Al => dynasm!(self ; cinc W(dst), W(src), al), + }; + } + (Size::S64, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + match cond { + Condition::Eq => dynasm!(self ; cinc X(src), X(dst), eq), + Condition::Ne => dynasm!(self ; cinc X(src), X(dst), ne), + Condition::Cs => dynasm!(self ; cinc X(src), X(dst), cs), + Condition::Cc => dynasm!(self ; cinc X(src), X(dst), cc), + Condition::Mi => dynasm!(self ; cinc X(src), X(dst), mi), + Condition::Pl => dynasm!(self ; cinc X(src), X(dst), pl), + Condition::Vs => dynasm!(self ; cinc X(src), X(dst), vs), + Condition::Vc => dynasm!(self ; cinc X(src), X(dst), vc), + Condition::Hi => dynasm!(self ; cinc X(src), X(dst), hi), + Condition::Ls => dynasm!(self ; cinc X(src), X(dst), ls), + Condition::Ge => dynasm!(self ; cinc X(src), X(dst), ge), + Condition::Lt => dynasm!(self ; cinc X(src), X(dst), lt), + Condition::Gt => dynasm!(self ; cinc X(src), X(dst), gt), + Condition::Le => dynasm!(self ; cinc X(src), X(dst), le), + Condition::Al => dynasm!(self ; cinc X(src), X(dst), al), + }; + } + _ => unreachable!(), + } + } + + fn emit_clz(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S64, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; clz X(dst), X(src)); + } + (Size::S32, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; clz W(dst), W(src)); + } + _ => panic!("singlepass can't emit CLS {:?} {:?} {:?}", sz, src, dst), + } + } + fn emit_rbit(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S64, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as 
u32; + dynasm!(self ; rbit X(dst), X(src)); + } + (Size::S32, Location::GPR(src), Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; rbit W(dst), W(src)); + } + _ => panic!("singlepass can't emit CLS {:?} {:?} {:?}", sz, src, dst), + } + } + + fn emit_label(&mut self, label: Label) { + dynasm!(self ; => label); + } + fn emit_load_label(&mut self, reg: GPR, label: Label) { + let reg = reg.into_index() as u32; + dynasm!(self ; adr X(reg), =>label); + } + fn emit_b_label(&mut self, label: Label) { + dynasm!(self ; b =>label); + } + fn emit_cbz_label(&mut self, sz: Size, reg: Location, label: Label) { + match (sz, reg) { + (Size::S32, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; cbz W(reg), =>label); + } + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; cbz X(reg), =>label); + } + _ => panic!("singlepass can't emit CBZ {:?} {:?} {:?}", sz, reg, label), + } + } + fn emit_cbnz_label(&mut self, sz: Size, reg: Location, label: Label) { + match (sz, reg) { + (Size::S32, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; cbnz W(reg), =>label); + } + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; cbnz X(reg), =>label); + } + _ => panic!("singlepass can't emit CBNZ {:?} {:?} {:?}", sz, reg, label), + } + } + fn emit_tbz_label(&mut self, sz: Size, reg: Location, n: u32, label: Label) { + match (sz, reg) { + (Size::S32, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; tbz W(reg), n, =>label); + } + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; tbz X(reg), n, =>label); + } + _ => panic!( + "singlepass can't emit TBZ {:?} {:?} {:?} {:?}", + sz, reg, n, label + ), + } + } + fn emit_tbnz_label(&mut self, sz: Size, reg: Location, n: u32, label: Label) { + match (sz, reg) { + (Size::S32, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; tbnz W(reg), n, =>label); + } + (Size::S64, Location::GPR(reg)) => { + let reg = reg.into_index() as u32; + dynasm!(self ; tbnz X(reg), n, =>label); + } + _ => panic!( + "singlepass can't emit TBNZ {:?} {:?} {:?} {:?}", + sz, reg, n, label + ), + } + } + fn emit_bcond_label(&mut self, condition: Condition, label: Label) { + match condition { + Condition::Eq => dynasm!(self ; b.eq => label), + Condition::Ne => dynasm!(self ; b.ne => label), + Condition::Cs => dynasm!(self ; b.cs => label), + Condition::Cc => dynasm!(self ; b.cc => label), + Condition::Mi => dynasm!(self ; b.mi => label), + Condition::Pl => dynasm!(self ; b.pl => label), + Condition::Vs => dynasm!(self ; b.vs => label), + Condition::Vc => dynasm!(self ; b.vc => label), + Condition::Hi => dynasm!(self ; b.hi => label), + Condition::Ls => dynasm!(self ; b.ls => label), + Condition::Ge => dynasm!(self ; b.ge => label), + Condition::Lt => dynasm!(self ; b.lt => label), + Condition::Gt => dynasm!(self ; b.gt => label), + Condition::Le => dynasm!(self ; b.le => label), + Condition::Al => dynasm!(self ; b => label), + } + } + fn emit_b_register(&mut self, reg: GPR) { + dynasm!(self ; br X(reg.into_index() as u32)); + } + fn emit_call_label(&mut self, label: Label) { + dynasm!(self ; bl =>label); + } + fn emit_call_register(&mut self, reg: GPR) { + dynasm!(self ; blr X(reg.into_index() as u32)); + } + fn emit_ret(&mut self) { + dynasm!(self ; ret); + } + + fn emit_udf(&mut self) { + dynasm!(self ; udf 
0x1234); + } + fn emit_dmb(&mut self) { + dynasm!(self ; dmb ish); + } + fn emit_brk(&mut self) { + dynasm!(self ; brk 0); + } + + fn emit_fcmp(&mut self, sz: Size, src1: Location, src2: Location) { + match (sz, src1, src2) { + (Size::S32, Location::SIMD(src1), Location::SIMD(src2)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + dynasm!(self ; fcmp S(src1), S(src2)); + } + (Size::S64, Location::SIMD(src1), Location::SIMD(src2)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + dynasm!(self ; fcmp D(src1), D(src2)); + } + _ => panic!("singlepass can't emit FCMP {:?} {:?} {:?}", sz, src1, src2), + } + } + + fn emit_fneg(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S32, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fneg S(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fneg D(dst), D(src)); + } + _ => panic!("singlepass can't emit FNEG {:?} {:?} {:?}", sz, src, dst), + } + } + fn emit_fsqrt(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S32, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fsqrt S(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fsqrt D(dst), D(src)); + } + _ => panic!("singlepass can't emit FSQRT {:?} {:?} {:?}", sz, src, dst), + } + } + + fn emit_fadd(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S32, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fadd S(dst), S(src1), S(src2)); + } + (Size::S64, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fadd D(dst), D(src1), D(src2)); + } + _ => panic!( + "singlepass can't emit FADD {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_fsub(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S32, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fsub S(dst), S(src1), S(src2)); + } + (Size::S64, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fsub D(dst), D(src1), D(src2)); + } + _ => panic!( + "singlepass can't emit FSUB {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_fmul(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S32, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fmul S(dst), S(src1), 
S(src2)); + } + (Size::S64, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fmul D(dst), D(src1), D(src2)); + } + _ => panic!( + "singlepass can't emit FMUL {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_fdiv(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S32, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fdiv S(dst), S(src1), S(src2)); + } + (Size::S64, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fdiv D(dst), D(src1), D(src2)); + } + _ => panic!( + "singlepass can't emit FDIV {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + + fn emit_fmin(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S32, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fmin S(dst), S(src1), S(src2)); + } + (Size::S64, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fmin D(dst), D(src1), D(src2)); + } + _ => panic!( + "singlepass can't emit FMIN {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + fn emit_fmax(&mut self, sz: Size, src1: Location, src2: Location, dst: Location) { + match (sz, src1, src2, dst) { + (Size::S32, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fmax S(dst), S(src1), S(src2)); + } + (Size::S64, Location::SIMD(src1), Location::SIMD(src2), Location::SIMD(dst)) => { + let src1 = src1.into_index() as u32; + let src2 = src2.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fmax D(dst), D(src1), D(src2)); + } + _ => panic!( + "singlepass can't emit FMAX {:?} {:?} {:?} {:?}", + sz, src1, src2, dst + ), + } + } + + fn emit_frintz(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S32, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; frintz S(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; frintz D(dst), D(src)); + } + _ => panic!("singlepass can't emit FRINTZ {:?} {:?} {:?}", sz, src, dst), + } + } + fn emit_frintn(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S32, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; frintn S(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; frintn D(dst), D(src)); + } + _ => 
panic!("singlepass can't emit FRINTN {:?} {:?} {:?}", sz, src, dst), + } + } + fn emit_frintm(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S32, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; frintm S(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; frintm D(dst), D(src)); + } + _ => panic!("singlepass can't emit FRINTM {:?} {:?} {:?}", sz, src, dst), + } + } + fn emit_frintp(&mut self, sz: Size, src: Location, dst: Location) { + match (sz, src, dst) { + (Size::S32, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; frintp S(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; frintp D(dst), D(src)); + } + _ => panic!("singlepass can't emit FRINTP {:?} {:?} {:?}", sz, src, dst), + } + } + + fn emit_scvtf(&mut self, sz_in: Size, src: Location, sz_out: Size, dst: Location) { + match (sz_in, src, sz_out, dst) { + (Size::S32, Location::GPR(src), Size::S32, Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; scvtf S(dst), W(src)); + } + (Size::S64, Location::GPR(src), Size::S32, Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; scvtf S(dst), X(src)); + } + (Size::S32, Location::GPR(src), Size::S64, Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; scvtf D(dst), W(src)); + } + (Size::S64, Location::GPR(src), Size::S64, Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; scvtf D(dst), X(src)); + } + _ => panic!( + "singlepass can't emit SCVTF {:?} {:?} {:?} {:?}", + sz_in, src, sz_out, dst + ), + } + } + fn emit_ucvtf(&mut self, sz_in: Size, src: Location, sz_out: Size, dst: Location) { + match (sz_in, src, sz_out, dst) { + (Size::S32, Location::GPR(src), Size::S32, Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ucvtf S(dst), W(src)); + } + (Size::S64, Location::GPR(src), Size::S32, Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ucvtf S(dst), X(src)); + } + (Size::S32, Location::GPR(src), Size::S64, Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ucvtf D(dst), W(src)); + } + (Size::S64, Location::GPR(src), Size::S64, Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; ucvtf D(dst), X(src)); + } + _ => panic!( + "singlepass can't emit UCVTF {:?} {:?} {:?} {:?}", + sz_in, src, sz_out, dst + ), + } + } + fn emit_fcvt(&mut self, sz_in: Size, src: Location, dst: Location) { + match (sz_in, src, dst) { + (Size::S32, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fcvt D(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Location::SIMD(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self 
; fcvt S(dst), D(src)); + } + _ => panic!( + "singlepass can't emit UCVTF {:?} {:?} {:?}", + sz_in, src, dst + ), + } + } + fn emit_fcvtzs(&mut self, sz_in: Size, src: Location, sz_out: Size, dst: Location) { + match (sz_in, src, sz_out, dst) { + (Size::S32, Location::SIMD(src), Size::S32, Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fcvtzs W(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Size::S32, Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fcvtzs W(dst), D(src)); + } + (Size::S32, Location::SIMD(src), Size::S64, Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fcvtzs X(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Size::S64, Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fcvtzs X(dst), D(src)); + } + _ => panic!( + "singlepass can't emit FCVTZS {:?} {:?} {:?} {:?}", + sz_in, src, sz_out, dst + ), + } + } + fn emit_fcvtzu(&mut self, sz_in: Size, src: Location, sz_out: Size, dst: Location) { + match (sz_in, src, sz_out, dst) { + (Size::S32, Location::SIMD(src), Size::S32, Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fcvtzu W(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Size::S32, Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fcvtzu W(dst), D(src)); + } + (Size::S32, Location::SIMD(src), Size::S64, Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fcvtzu X(dst), S(src)); + } + (Size::S64, Location::SIMD(src), Size::S64, Location::GPR(dst)) => { + let src = src.into_index() as u32; + let dst = dst.into_index() as u32; + dynasm!(self ; fcvtzu X(dst), D(src)); + } + _ => panic!( + "singlepass can't emit FCVTZU {:?} {:?} {:?} {:?}", + sz_in, src, sz_out, dst + ), + } + } + + // 1 011 0100 0100 000 => fpcr + fn emit_read_fpcr(&mut self, reg: GPR) { + dynasm!(self ; mrs X(reg as u32), 0b1_011_0100_0100_000); + } + fn emit_write_fpcr(&mut self, reg: GPR) { + dynasm!(self ; msr 0b1_011_0100_0100_000, X(reg as u32)); + } + // 1 011 0100 0100 001 => fpsr + fn emit_read_fpsr(&mut self, reg: GPR) { + dynasm!(self ; mrs X(reg as u32), 0b1_011_0100_0100_001); + } + fn emit_write_fpsr(&mut self, reg: GPR) { + dynasm!(self ; msr 0b1_011_0100_0100_001, X(reg as u32)); + } +} + +pub fn gen_std_trampoline_arm64( + sig: &FunctionType, + calling_convention: CallingConvention, +) -> FunctionBody { + let mut a = Assembler::new(0); + + let fptr = GPR::X27; + let args = GPR::X28; + + dynasm!(a + ; sub sp, sp, 32 + ; stp x29, x30, [sp] + ; stp X(fptr as u32), X(args as u32), [sp, 16] + ; mov x29, sp + ; mov X(fptr as u32), x1 + ; mov X(args as u32), x2 + ); + + let stack_args = sig.params().len().saturating_sub(7); //1st arg is ctx, not an actual arg + let mut stack_offset = stack_args as u32 * 8; + if stack_args > 0 { + if stack_offset % 16 != 0 { + stack_offset += 8; + assert!(stack_offset % 16 == 0); + } + dynasm!(a ; sub sp, sp, stack_offset); + } + + // Move arguments to their locations. + // `callee_vmctx` is already in the first argument register, so no need to move. 
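+ // Each value in `args` occupies a 16-byte slot (hence the `i * 16` offsets below). The first
+ // seven wasm parameters are loaded into x1..x7; any further parameters are spilled to the
+ // native stack, where the Apple AArch64 convention packs them by size while the default
+ // path uses one 8-byte slot per argument.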
+ let mut caller_stack_offset: i32 = 0; + for (i, param) in sig.params().iter().enumerate() { + let sz = match *param { + Type::I32 | Type::F32 => Size::S32, + Type::I64 | Type::F64 => Size::S64, + Type::ExternRef => Size::S64, + Type::FuncRef => Size::S64, + _ => panic!( + "singlepass unsupported param type for trampoline {:?}", + *param + ), + }; + match i { + 0..=6 => { + a.emit_ldr( + sz, + Location::GPR(GPR::from_index(i + 1).unwrap()), + Location::Memory(args, (i * 16) as i32), + ); + } + _ => { + match calling_convention { + CallingConvention::AppleAarch64 => { + match sz { + Size::S8 => (), + Size::S16 => { + if caller_stack_offset & 1 != 0 { + caller_stack_offset = (caller_stack_offset + 1) & !1; + } + } + Size::S32 => { + if caller_stack_offset & 3 != 0 { + caller_stack_offset = (caller_stack_offset + 3) & !3; + } + } + Size::S64 => { + if caller_stack_offset & 7 != 0 { + caller_stack_offset = (caller_stack_offset + 7) & !7; + } + } + }; + } + _ => (), + }; + // using X16 as scratch reg + a.emit_ldr( + sz, + Location::GPR(GPR::X16), + Location::Memory(args, (i * 16) as i32), + ); + a.emit_str( + sz, + Location::GPR(GPR::X16), + Location::Memory(GPR::XzrSp, caller_stack_offset), + ); + match calling_convention { + CallingConvention::AppleAarch64 => { + caller_stack_offset += match sz { + Size::S8 => 1, + Size::S16 => 2, + Size::S32 => 4, + Size::S64 => 8, + }; + } + _ => { + caller_stack_offset += 8; + } + } + } + } + } + + dynasm!(a ; blr X(fptr as u32)); + + // Write return value. + if !sig.results().is_empty() { + a.emit_str(Size::S64, Location::GPR(GPR::X0), Location::Memory(args, 0)); + } + + // Restore stack. + dynasm!(a + ; ldp X(fptr as u32), X(args as u32), [x29, 16] + ; ldp x29, x30, [x29] + ; add sp, sp, 32 + stack_offset as u32 + ; ret + ); + + FunctionBody { + body: a.finalize().unwrap().to_vec(), + unwind_info: None, + } +} +// Generates dynamic import function call trampoline for a function type. +pub fn gen_std_dynamic_import_trampoline_arm64( + vmoffsets: &VMOffsets, + sig: &FunctionType, + calling_convention: CallingConvention, +) -> FunctionBody { + let mut a = Assembler::new(0); + // Allocate argument array. + let stack_offset: usize = 16 * std::cmp::max(sig.params().len(), sig.results().len()); + // Save LR and X26, as scratch register + a.emit_stpdb( + Size::S64, + Location::GPR(GPR::X30), + Location::GPR(GPR::X26), + GPR::XzrSp, + 16, + ); + + if stack_offset != 0 { + if stack_offset < 0x1000 { + a.emit_sub( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm32(stack_offset as _), + Location::GPR(GPR::XzrSp), + ); + } else { + a.emit_mov_imm(Location::GPR(GPR::X26), stack_offset as u64); + a.emit_sub( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::GPR(GPR::X26), + Location::GPR(GPR::XzrSp), + ); + } + } + + // Copy arguments. 
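+ // Each argument is written into a 16-byte slot of the on-stack values array, and the upper
+ // 8 bytes of every slot are zeroed (via xzr) below. Register arguments come from the
+ // ArgumentRegisterAllocator; overflow arguments are reloaded from the caller's stack at
+ // `stack_offset + 16 + stack_param_count`, where the extra 16 skips the saved x30/x26 pair.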
+ if !sig.params().is_empty() { + let mut argalloc = ArgumentRegisterAllocator::default(); + argalloc.next(Type::I64, calling_convention).unwrap(); // skip VMContext + + let mut stack_param_count: usize = 0; + + for (i, ty) in sig.params().iter().enumerate() { + let source_loc = match argalloc.next(*ty, calling_convention) { + Some(ARM64Register::GPR(gpr)) => Location::GPR(gpr), + Some(ARM64Register::NEON(neon)) => Location::SIMD(neon), + None => { + let sz = match calling_convention { + CallingConvention::AppleAarch64 => match *ty { + Type::I32 | Type::F32 => Size::S32, + _ => { + if stack_param_count & 7 != 0 { + stack_param_count = (stack_param_count + 7) & !7; + }; + Size::S64 + } + }, + _ => Size::S64, + }; + a.emit_ldr( + sz, + Location::GPR(GPR::X26), + Location::Memory(GPR::XzrSp, (stack_offset + 16 + stack_param_count) as _), + ); + stack_param_count += match sz { + Size::S32 => 4, + Size::S64 => 8, + _ => unreachable!(), + }; + Location::GPR(GPR::X26) + } + }; + a.emit_str( + Size::S64, + source_loc, + Location::Memory(GPR::XzrSp, (i * 16) as _), + ); + + // Zero upper 64 bits. + a.emit_str( + Size::S64, + Location::GPR(GPR::XzrSp), // XZR here + Location::Memory(GPR::XzrSp, (i * 16 + 8) as _), // XSP here + ); + } + } + + match calling_convention { + _ => { + // Load target address. + let offset = vmoffsets.vmdynamicfunction_import_context_address(); + a.emit_ldur(Size::S64, Location::GPR(GPR::X26), GPR::X0, offset as i32); + // Load values array. + a.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm8(0), + Location::GPR(GPR::X1), + ); + } + }; + + // Call target. + a.emit_call_register(GPR::X26); + + // Fetch return value. + if !sig.results().is_empty() { + assert_eq!(sig.results().len(), 1); + a.emit_ldr( + Size::S64, + Location::GPR(GPR::X0), + Location::Memory(GPR::XzrSp, 0), + ); + } + + // Release values array. + if stack_offset != 0 { + if stack_offset < 0x1000 { + a.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm32(stack_offset as _), + Location::GPR(GPR::XzrSp), + ); + } else { + a.emit_mov_imm(Location::GPR(GPR::X26), stack_offset as u64); + a.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::GPR(GPR::X26), + Location::GPR(GPR::XzrSp), + ); + } + } + a.emit_ldpia( + Size::S64, + Location::GPR(GPR::X30), + Location::GPR(GPR::X26), + GPR::XzrSp, + 16, + ); + + // Return. + a.emit_ret(); + + FunctionBody { + body: a.finalize().unwrap().to_vec(), + unwind_info: None, + } +} +// Singlepass calls import functions through a trampoline. +pub fn gen_import_call_trampoline_arm64( + vmoffsets: &VMOffsets, + index: FunctionIndex, + sig: &FunctionType, + calling_convention: CallingConvention, +) -> CustomSection { + let mut a = Assembler::new(0); + + // Singlepass internally treats all arguments as integers + // For the standard System V calling convention requires + // floating point arguments to be passed in NEON registers. + // Translation is expensive, so only do it if needed. + if sig + .params() + .iter() + .any(|&x| x == Type::F32 || x == Type::F64) + { + match calling_convention { + _ => { + let mut param_locations: Vec = vec![]; + + // Allocate stack space for arguments. 
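+ // Only the register-passed parameters (x1..x7, at most seven) need a spill area here,
+ // 8 bytes each, rounded up so that sp stays 16-byte aligned.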
+ let stack_offset: i32 = if sig.params().len() > 7 { + 7 * 8 + } else { + (sig.params().len() as i32) * 8 + }; + let stack_offset = if stack_offset & 15 != 0 { + stack_offset + 8 + } else { + stack_offset + }; + if stack_offset > 0 { + if stack_offset < 0x1000 { + a.emit_sub( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm32(stack_offset as u32), + Location::GPR(GPR::XzrSp), + ); + } else { + a.emit_mov_imm(Location::GPR(GPR::X16), stack_offset as u64); + a.emit_sub( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::GPR(GPR::X16), + Location::GPR(GPR::XzrSp), + ); + } + } + + // Store all arguments to the stack to prevent overwrite. + for i in 0..sig.params().len() { + let loc = match i { + 0..=6 => { + static PARAM_REGS: &[GPR] = &[ + GPR::X1, + GPR::X2, + GPR::X3, + GPR::X4, + GPR::X5, + GPR::X6, + GPR::X7, + ]; + let loc = Location::Memory(GPR::XzrSp, (i * 8) as i32); + a.emit_str(Size::S64, Location::GPR(PARAM_REGS[i]), loc); + loc + } + _ => Location::Memory(GPR::XzrSp, stack_offset + ((i - 7) * 8) as i32), + }; + param_locations.push(loc); + } + + // Copy arguments. + let mut caller_stack_offset: i32 = 0; + let mut argalloc = ArgumentRegisterAllocator::default(); + argalloc.next(Type::I64, calling_convention).unwrap(); // skip VMContext + for (i, ty) in sig.params().iter().enumerate() { + let prev_loc = param_locations[i]; + let targ = match argalloc.next(*ty, calling_convention) { + Some(ARM64Register::GPR(gpr)) => Location::GPR(gpr), + Some(ARM64Register::NEON(neon)) => Location::SIMD(neon), + None => { + // No register can be allocated. Put this argument on the stack. + a.emit_ldr(Size::S64, Location::GPR(GPR::X16), prev_loc); + a.emit_str( + Size::S64, + Location::GPR(GPR::X16), + Location::Memory(GPR::XzrSp, stack_offset + caller_stack_offset), + ); + caller_stack_offset += 8; + continue; + } + }; + a.emit_ldr(Size::S64, targ, prev_loc); + } + + // Restore stack pointer. + if stack_offset > 0 { + if stack_offset < 0x1000 { + a.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm32(stack_offset as u32), + Location::GPR(GPR::XzrSp), + ); + } else { + a.emit_mov_imm(Location::GPR(GPR::X16), stack_offset as u64); + a.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::GPR(GPR::X16), + Location::GPR(GPR::XzrSp), + ); + } + } + } + } + } + + // Emits a tail call trampoline that loads the address of the target import function + // from Ctx and jumps to it. + + let offset = vmoffsets.vmctx_vmfunction_import(index); + // for ldr, offset needs to be a multiple of 8, wich often is not + // so use ldur, but then offset is limited to -255 .. +255. 
It will be positive here + let offset = + if (offset > 0 && offset < 0xF8) || (offset > 0 && offset < 0x7FF8 && (offset & 7) == 0) { + offset + } else { + a.emit_mov_imm(Location::GPR(GPR::X16), (offset as i64) as u64); + a.emit_add( + Size::S64, + Location::GPR(GPR::X0), + Location::GPR(GPR::X16), + Location::GPR(GPR::X0), + ); + 0 + }; + match calling_convention { + _ => { + if (offset & 7) == 0 { + a.emit_ldr( + Size::S64, + Location::GPR(GPR::X16), + Location::Memory(GPR::X0, offset as i32), // function pointer + ); + a.emit_ldr( + Size::S64, + Location::GPR(GPR::X0), + Location::Memory(GPR::X0, offset as i32 + 8), // target vmctx + ); + } else { + a.emit_ldur( + Size::S64, + Location::GPR(GPR::X16), + GPR::X0, + offset as i32, // function pointer + ); + a.emit_ldur( + Size::S64, + Location::GPR(GPR::X0), + GPR::X0, + offset as i32 + 8, // target vmctx + ); + } + } + } + a.emit_b_register(GPR::X16); + + let section_body = SectionBody::new_with_vec(a.finalize().unwrap().to_vec()); + + CustomSection { + protection: CustomSectionProtection::ReadExecute, + bytes: section_body, + relocations: vec![], + } +} diff --git a/lib/compiler-singlepass/src/lib.rs b/lib/compiler-singlepass/src/lib.rs index 42cde289d36..48329748abf 100644 --- a/lib/compiler-singlepass/src/lib.rs +++ b/lib/compiler-singlepass/src/lib.rs @@ -9,13 +9,16 @@ //! runtime performance. mod address_map; +mod arm64_decl; mod codegen; mod common_decl; mod compiler; mod config; +mod emitter_arm64; mod emitter_x64; mod location; mod machine; +mod machine_arm64; mod machine_x64; mod x64_decl; diff --git a/lib/compiler-singlepass/src/location.rs b/lib/compiler-singlepass/src/location.rs index bc5277d2cad..61817068a49 100644 --- a/lib/compiler-singlepass/src/location.rs +++ b/lib/compiler-singlepass/src/location.rs @@ -71,9 +71,4 @@ pub trait CombinedRegister: Copy + Clone + Eq + PartialEq + Debug { fn from_gpr(x: u16) -> Self; /// Convert from an SIMD register fn from_simd(x: u16) -> Self; - /// Returns the instruction prefix for move to stack - /// for example `movq %this_reg, ?(%rsp)` on x86_64 - /// To build an instruction, append the memory location as a 32-bit - /// offset to the stack pointer to this prefix. - fn _prefix_mov_to_stack(&self) -> Option<&'static [u8]>; } diff --git a/lib/compiler-singlepass/src/machine.rs b/lib/compiler-singlepass/src/machine.rs index 67b940749c9..a68daa91ec5 100644 --- a/lib/compiler-singlepass/src/machine.rs +++ b/lib/compiler-singlepass/src/machine.rs @@ -1,5 +1,6 @@ use crate::common_decl::*; use crate::location::{Location, Reg}; +use crate::machine_arm64::MachineARM64; use crate::machine_x64::MachineX86_64; use dynasmrt::{AssemblyOffset, DynamicLabel}; use std::collections::BTreeMap; @@ -81,8 +82,8 @@ pub trait Machine { fn reserve_unused_temp_gpr(&mut self, gpr: Self::GPR) -> Self::GPR; /// reserve a GPR fn reserve_gpr(&mut self, gpr: Self::GPR); - /// Push used gpr to the stack - fn push_used_gpr(&mut self); + /// Push used gpr to the stack. Return the bytes taken on the stack + fn push_used_gpr(&mut self) -> usize; /// Pop used gpr to the stack fn pop_used_gpr(&mut self); /// Picks an unused SIMD register. @@ -99,10 +100,12 @@ pub trait Machine { fn reserve_simd(&mut self, simd: Self::SIMD); /// Releases a temporary XMM register. fn release_simd(&mut self, simd: Self::SIMD); - /// Push used simd regs to the stack - fn push_used_simd(&mut self); + /// Push used simd regs to the stack. 
Return bytes taken on the stack + fn push_used_simd(&mut self) -> usize; /// Pop used simd regs to the stack fn pop_used_simd(&mut self); + /// Return a rounded stack adjustement value (must be multiple of 16bytes on ARM64 for example) + fn round_stack_adjust(&self, value: usize) -> usize; /// Set the source location of the Wasm to the given offset. fn set_srcloc(&mut self, offset: u32); /// Marks each address in the code range emitted by `f` with the trap code `code`. @@ -129,10 +132,6 @@ pub trait Machine { /// restore stack /// Like assembler.emit_add(Size::S64, Location::Imm32(delta_stack_offset as u32), Location::GPR(GPR::RSP)) fn restore_stack(&mut self, delta_stack_offset: u32); - /// push callee saved register to the stack - fn push_callee_saved(&mut self); - /// pop callee saved register from the stack - fn pop_callee_saved(&mut self); /// Pop stack of locals /// Like assembler.emit_add(Size::S64, Location::Imm32(delta_stack_offset as u32), Location::GPR(GPR::RSP)) fn pop_stack_locals(&mut self, delta_stack_offset: u32); @@ -158,8 +157,24 @@ pub trait Machine { &self, calling_convention: CallingConvention, ) -> Vec>; - /// Get param location + /// Get param location (to build a call, using SP for stack args) fn get_param_location( + &self, + idx: usize, + sz: Size, + stack_offset: &mut usize, + calling_convention: CallingConvention, + ) -> Location; + /// Get call param location (from a call, using FP for stack args) + fn get_call_param_location( + &self, + idx: usize, + sz: Size, + stack_offset: &mut usize, + calling_convention: CallingConvention, + ) -> Location; + /// Get simple param location + fn get_simple_param_location( &self, idx: usize, calling_convention: CallingConvention, @@ -259,6 +274,10 @@ pub trait Machine { fn get_gpr_for_ret(&self) -> Self::GPR; /// get the simd for the return of float/double values fn get_simd_for_ret(&self) -> Self::SIMD; + + /// Emit a debug breakpoint + fn emit_debug_breakpoint(&mut self); + /// load the address of a memory location (will panic if src is not a memory) /// like LEA opcode on x86_64 fn location_address( @@ -430,6 +449,7 @@ pub trait Machine { loc_b: Location, ret: Location, integer_division_by_zero: Label, + integer_overflow: Label, ) -> usize; /// Signed Division with location directly from the stack. return the offset of the DIV opcode, to mark as trappable. fn emit_binop_sdiv32( @@ -438,6 +458,7 @@ pub trait Machine { loc_b: Location, ret: Location, integer_division_by_zero: Label, + integer_overflow: Label, ) -> usize; /// Unsigned Reminder (of a division) with location directly from the stack. return the offset of the DIV opcode, to mark as trappable. fn emit_binop_urem32( @@ -446,6 +467,7 @@ pub trait Machine { loc_b: Location, ret: Location, integer_division_by_zero: Label, + integer_overflow: Label, ) -> usize; /// Signed Reminder (of a Division) with location directly from the stack. return the offset of the DIV opcode, to mark as trappable. fn emit_binop_srem32( @@ -454,6 +476,7 @@ pub trait Machine { loc_b: Location, ret: Location, integer_division_by_zero: Label, + integer_overflow: Label, ) -> usize; /// And with location directly from the stack fn emit_binop_and32( @@ -1043,6 +1066,7 @@ pub trait Machine { loc_b: Location, ret: Location, integer_division_by_zero: Label, + integer_overflow: Label, ) -> usize; /// Signed Division with location directly from the stack. return the offset of the DIV opcode, to mark as trappable. 
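// Editorial note: the new `integer_overflow` label threaded through the signed
// division and remainder emitters in this trait exists because wasm signed
// division traps on two distinct conditions, not just division by zero. A
// minimal sketch of the semantics being guarded (hypothetical helper name,
// not code from this PR):
fn checked_wasm_div_s(a: i32, b: i32) -> Result<i32, &'static str> {
    if b == 0 {
        Err("integer divide by zero") // -> integer_division_by_zero label
    } else if a == i32::MIN && b == -1 {
        Err("integer overflow") // -> integer_overflow label: i32::MIN / -1 does not fit in i32
    } else {
        Ok(a.wrapping_div(b))
    }
}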
fn emit_binop_sdiv64( @@ -1051,6 +1075,7 @@ pub trait Machine { loc_b: Location, ret: Location, integer_division_by_zero: Label, + integer_overflow: Label, ) -> usize; /// Unsigned Reminder (of a division) with location directly from the stack. return the offset of the DIV opcode, to mark as trappable. fn emit_binop_urem64( @@ -1059,6 +1084,7 @@ pub trait Machine { loc_b: Location, ret: Location, integer_division_by_zero: Label, + integer_overflow: Label, ) -> usize; /// Signed Reminder (of a Division) with location directly from the stack. return the offset of the DIV opcode, to mark as trappable. fn emit_binop_srem64( @@ -1067,6 +1093,7 @@ pub trait Machine { loc_b: Location, ret: Location, integer_division_by_zero: Label, + integer_overflow: Label, ) -> usize; /// And with location directly from the stack fn emit_binop_and64( @@ -2166,11 +2193,17 @@ pub fn gen_std_trampoline( target: &Target, calling_convention: CallingConvention, ) -> FunctionBody { - let machine = match target.triple().architecture { - Architecture::X86_64 => MachineX86_64::new(), + match target.triple().architecture { + Architecture::X86_64 => { + let machine = MachineX86_64::new(); + machine.gen_std_trampoline(sig, calling_convention) + } + Architecture::Aarch64(_) => { + let machine = MachineARM64::new(); + machine.gen_std_trampoline(sig, calling_convention) + } _ => unimplemented!(), - }; - machine.gen_std_trampoline(sig, calling_convention) + } } /// Generates dynamic import function call trampoline for a function type. pub fn gen_std_dynamic_import_trampoline( @@ -2179,11 +2212,17 @@ pub fn gen_std_dynamic_import_trampoline( target: &Target, calling_convention: CallingConvention, ) -> FunctionBody { - let machine = match target.triple().architecture { - Architecture::X86_64 => MachineX86_64::new(), + match target.triple().architecture { + Architecture::X86_64 => { + let machine = MachineX86_64::new(); + machine.gen_std_dynamic_import_trampoline(vmoffsets, sig, calling_convention) + } + Architecture::Aarch64(_) => { + let machine = MachineARM64::new(); + machine.gen_std_dynamic_import_trampoline(vmoffsets, sig, calling_convention) + } _ => unimplemented!(), - }; - machine.gen_std_dynamic_import_trampoline(vmoffsets, sig, calling_convention) + } } /// Singlepass calls import functions through a trampoline. pub fn gen_import_call_trampoline( @@ -2193,9 +2232,54 @@ pub fn gen_import_call_trampoline( target: &Target, calling_convention: CallingConvention, ) -> CustomSection { - let machine = match target.triple().architecture { - Architecture::X86_64 => MachineX86_64::new(), + match target.triple().architecture { + Architecture::X86_64 => { + let machine = MachineX86_64::new(); + machine.gen_import_call_trampoline(vmoffsets, index, sig, calling_convention) + } + Architecture::Aarch64(_) => { + let machine = MachineARM64::new(); + machine.gen_import_call_trampoline(vmoffsets, index, sig, calling_convention) + } _ => unimplemented!(), - }; - machine.gen_import_call_trampoline(vmoffsets, index, sig, calling_convention) + } } + +// Constants for the bounds of truncation operations. These are the least or +// greatest exact floats in either f32 or f64 representation less-than (for +// least) or greater-than (for greatest) the i32 or i64 or u32 or u64 +// min (for least) or max (for greatest), when rounding towards zero. + +/// Greatest Exact Float (32 bits) less-than i32::MIN when rounding towards zero. 
+pub const GEF32_LT_I32_MIN: f32 = -2147483904.0; +/// Least Exact Float (32 bits) greater-than i32::MAX when rounding towards zero. +pub const LEF32_GT_I32_MAX: f32 = 2147483648.0; +/// Greatest Exact Float (32 bits) less-than i64::MIN when rounding towards zero. +pub const GEF32_LT_I64_MIN: f32 = -9223373136366403584.0; +/// Least Exact Float (32 bits) greater-than i64::MAX when rounding towards zero. +pub const LEF32_GT_I64_MAX: f32 = 9223372036854775808.0; +/// Greatest Exact Float (32 bits) less-than u32::MIN when rounding towards zero. +pub const GEF32_LT_U32_MIN: f32 = -1.0; +/// Least Exact Float (32 bits) greater-than u32::MAX when rounding towards zero. +pub const LEF32_GT_U32_MAX: f32 = 4294967296.0; +/// Greatest Exact Float (32 bits) less-than u64::MIN when rounding towards zero. +pub const GEF32_LT_U64_MIN: f32 = -1.0; +/// Least Exact Float (32 bits) greater-than u64::MAX when rounding towards zero. +pub const LEF32_GT_U64_MAX: f32 = 18446744073709551616.0; + +/// Greatest Exact Float (64 bits) less-than i32::MIN when rounding towards zero. +pub const GEF64_LT_I32_MIN: f64 = -2147483649.0; +/// Least Exact Float (64 bits) greater-than i32::MAX when rounding towards zero. +pub const LEF64_GT_I32_MAX: f64 = 2147483648.0; +/// Greatest Exact Float (64 bits) less-than i64::MIN when rounding towards zero. +pub const GEF64_LT_I64_MIN: f64 = -9223372036854777856.0; +/// Least Exact Float (64 bits) greater-than i64::MAX when rounding towards zero. +pub const LEF64_GT_I64_MAX: f64 = 9223372036854775808.0; +/// Greatest Exact Float (64 bits) less-than u32::MIN when rounding towards zero. +pub const GEF64_LT_U32_MIN: f64 = -1.0; +/// Least Exact Float (64 bits) greater-than u32::MAX when rounding towards zero. +pub const LEF64_GT_U32_MAX: f64 = 4294967296.0; +/// Greatest Exact Float (64 bits) less-than u64::MIN when rounding towards zero. +pub const GEF64_LT_U64_MIN: f64 = -1.0; +/// Least Exact Float (64 bits) greater-than u64::MAX when rounding towards zero. +pub const LEF64_GT_U64_MAX: f64 = 18446744073709551616.0; diff --git a/lib/compiler-singlepass/src/machine_arm64.rs b/lib/compiler-singlepass/src/machine_arm64.rs new file mode 100644 index 00000000000..eca8e1f5777 --- /dev/null +++ b/lib/compiler-singlepass/src/machine_arm64.rs @@ -0,0 +1,5025 @@ +use crate::arm64_decl::new_machine_state; +use crate::arm64_decl::{GPR, NEON}; +use crate::common_decl::*; +use crate::emitter_arm64::*; +use crate::location::Location as AbstractLocation; +use crate::machine::*; +use dynasmrt::{aarch64::Aarch64Relocation, VecAssembler}; +use std::collections::HashSet; +use wasmer_compiler::wasmparser::Type as WpType; +use wasmer_compiler::{ + CallingConvention, CustomSection, FunctionBody, InstructionAddressMap, Relocation, + RelocationKind, RelocationTarget, SourceLoc, TrapInformation, +}; +use wasmer_types::{FunctionIndex, FunctionType}; +use wasmer_vm::{TrapCode, VMOffsets}; + +type Assembler = VecAssembler; +type Location = AbstractLocation; + +pub struct MachineARM64 { + assembler: Assembler, + used_gprs: HashSet, + used_simd: HashSet, + trap_table: TrapTable, + /// Map from byte offset into wasm function to range of native instructions. + /// + // Ordered by increasing InstructionAddressMap::srcloc. + instructions_address_map: Vec, + /// The source location for the current operator. + src_loc: u32, + /// is last push on a 8byte multiple or 16bytes? 
+ pushed: bool, +} + +#[allow(dead_code)] +#[derive(PartialEq)] +enum ImmType { + None, + NoneXzr, + Bits8, + Bits12, + Shift32, + Shift32No0, + Shift64, + Shift64No0, + Logical32, + Logical64, + UnscaledOffset, + OffsetByte, + OffsetHWord, + OffsetWord, + OffsetDWord, +} + +#[allow(dead_code)] +impl MachineARM64 { + pub fn new() -> Self { + MachineARM64 { + assembler: Assembler::new(0), + used_gprs: HashSet::new(), + used_simd: HashSet::new(), + trap_table: TrapTable::default(), + instructions_address_map: vec![], + src_loc: 0, + pushed: false, + } + } + fn compatible_imm(&self, imm: i64, ty: ImmType) -> bool { + match ty { + ImmType::None => false, + ImmType::NoneXzr => false, + ImmType::Bits8 => (imm >= 0) && (imm < 256), + ImmType::Bits12 => (imm >= 0) && (imm < 0x1000), + ImmType::Shift32 => (imm >= 0) && (imm < 32), + ImmType::Shift32No0 => (imm > 0) && (imm < 32), + ImmType::Shift64 => (imm >= 0) && (imm < 64), + ImmType::Shift64No0 => (imm > 0) && (imm < 64), + ImmType::Logical32 => encode_logical_immediate_32bit(imm as u32).is_some(), + ImmType::Logical64 => encode_logical_immediate_64bit(imm as u64).is_some(), + ImmType::UnscaledOffset => (imm > -256) && (imm < 256), + ImmType::OffsetByte => (imm >= 0) && (imm < 0x1000), + ImmType::OffsetHWord => (imm & 1 == 0) && (imm >= 0) && (imm < 0x2000), + ImmType::OffsetWord => (imm & 3 == 0) && (imm >= 0) && (imm < 0x4000), + ImmType::OffsetDWord => (imm & 7 == 0) && (imm >= 0) && (imm < 0x8000), + } + } + + fn location_to_reg( + &mut self, + sz: Size, + src: Location, + temps: &mut Vec, + allow_imm: ImmType, + read_val: bool, + wanted: Option, + ) -> Location { + match src { + Location::GPR(_) | Location::SIMD(_) => src, + Location::Imm8(val) => { + if allow_imm == ImmType::NoneXzr && val == 0 { + Location::GPR(GPR::XzrSp) + } else { + if self.compatible_imm(val as i64, allow_imm) { + src + } else { + let tmp = if wanted.is_some() { + wanted.unwrap() + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + tmp + }; + self.assembler.emit_mov_imm(Location::GPR(tmp), val as u64); + Location::GPR(tmp) + } + } + } + Location::Imm32(val) => { + if allow_imm == ImmType::NoneXzr && val == 0 { + Location::GPR(GPR::XzrSp) + } else { + if self.compatible_imm(val as i64, allow_imm) { + src + } else { + let tmp = if wanted.is_some() { + wanted.unwrap() + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + tmp + }; + self.assembler + .emit_mov_imm(Location::GPR(tmp), (val as i64) as u64); + Location::GPR(tmp) + } + } + } + Location::Imm64(val) => { + if allow_imm == ImmType::NoneXzr && val == 0 { + Location::GPR(GPR::XzrSp) + } else { + if self.compatible_imm(val as i64, allow_imm) { + src + } else { + let tmp = if wanted.is_some() { + wanted.unwrap() + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + tmp + }; + self.assembler.emit_mov_imm(Location::GPR(tmp), val as u64); + Location::GPR(tmp) + } + } + } + Location::Memory(reg, val) => { + let tmp = if wanted.is_some() { + wanted.unwrap() + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + tmp + }; + if read_val { + let offsize = if sz == Size::S32 { + ImmType::OffsetWord + } else { + ImmType::OffsetDWord + }; + if self.compatible_imm(val as i64, offsize) { + self.assembler.emit_ldr( + sz, + Location::GPR(tmp), + Location::Memory(reg, val as _), + ); + } else if self.compatible_imm(val as i64, ImmType::UnscaledOffset) { + self.assembler.emit_ldur(sz, Location::GPR(tmp), 
reg, val); + } else { + if reg == tmp { + unreachable!(); + } + self.assembler + .emit_mov_imm(Location::GPR(tmp), (val as i64) as u64); + self.assembler.emit_ldr( + sz, + Location::GPR(tmp), + Location::Memory2(reg, tmp, Multiplier::One, 0), + ); + } + } + Location::GPR(tmp) + } + _ => panic!("singlepass can't emit location_to_reg {:?} {:?}", sz, src), + } + } + fn location_to_neon( + &mut self, + sz: Size, + src: Location, + temps: &mut Vec, + allow_imm: ImmType, + read_val: bool, + ) -> Location { + match src { + Location::SIMD(_) => src, + Location::GPR(_) => { + let tmp = self.acquire_temp_simd().unwrap(); + temps.push(tmp.clone()); + if read_val { + self.assembler.emit_mov(sz, src, Location::SIMD(tmp)); + } + Location::SIMD(tmp) + } + Location::Imm8(val) => { + if self.compatible_imm(val as i64, allow_imm) { + src + } else { + let gpr = self.acquire_temp_gpr().unwrap(); + let tmp = self.acquire_temp_simd().unwrap(); + temps.push(tmp.clone()); + self.assembler.emit_mov_imm(Location::GPR(gpr), val as u64); + self.assembler + .emit_mov(sz, Location::GPR(gpr), Location::SIMD(tmp)); + self.release_gpr(gpr); + Location::SIMD(tmp) + } + } + Location::Imm32(val) => { + if self.compatible_imm(val as i64, allow_imm) { + src + } else { + let gpr = self.acquire_temp_gpr().unwrap(); + let tmp = self.acquire_temp_simd().unwrap(); + temps.push(tmp.clone()); + self.assembler + .emit_mov_imm(Location::GPR(gpr), (val as i64) as u64); + self.assembler + .emit_mov(sz, Location::GPR(gpr), Location::SIMD(tmp)); + self.release_gpr(gpr); + Location::SIMD(tmp) + } + } + Location::Imm64(val) => { + if self.compatible_imm(val as i64, allow_imm) { + src + } else { + let gpr = self.acquire_temp_gpr().unwrap(); + let tmp = self.acquire_temp_simd().unwrap(); + temps.push(tmp.clone()); + self.assembler.emit_mov_imm(Location::GPR(gpr), val as u64); + self.assembler + .emit_mov(sz, Location::GPR(gpr), Location::SIMD(tmp)); + self.release_gpr(gpr); + Location::SIMD(tmp) + } + } + Location::Memory(reg, val) => { + let tmp = self.acquire_temp_simd().unwrap(); + temps.push(tmp.clone()); + if read_val { + let offsize = if sz == Size::S32 { + ImmType::OffsetWord + } else { + ImmType::OffsetDWord + }; + if self.compatible_imm(val as i64, offsize) { + self.assembler.emit_ldr( + sz, + Location::SIMD(tmp), + Location::Memory(reg, val as _), + ); + } else if self.compatible_imm(val as i64, ImmType::UnscaledOffset) { + self.assembler.emit_ldur(sz, Location::SIMD(tmp), reg, val); + } else { + let gpr = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(gpr), (val as i64) as u64); + self.assembler.emit_ldr( + sz, + Location::SIMD(tmp), + Location::Memory2(reg, gpr, Multiplier::One, 0), + ); + self.release_gpr(gpr); + } + } + Location::SIMD(tmp) + } + _ => panic!("singlepass can't emit location_to_neon {:?} {:?}", sz, src), + } + } + + fn emit_relaxed_binop( + &mut self, + op: fn(&mut Assembler, Size, Location, Location), + sz: Size, + src: Location, + dst: Location, + putback: bool, + ) { + let mut temps = vec![]; + let src_imm = if putback { + ImmType::None + } else { + ImmType::Bits12 + }; + let src = self.location_to_reg(sz, src, &mut temps, src_imm, true, None); + let dest = self.location_to_reg(sz, dst, &mut temps, ImmType::None, !putback, None); + op(&mut self.assembler, sz, src, dest); + if dst != dest && putback { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_binop_neon( + &mut self, + op: fn(&mut Assembler, Size, Location, 
Location), + sz: Size, + src: Location, + dst: Location, + putback: bool, + ) { + let mut temps = vec![]; + let src = self.location_to_neon(sz, src, &mut temps, ImmType::None, true); + let dest = self.location_to_neon(sz, dst, &mut temps, ImmType::None, !putback); + op(&mut self.assembler, sz, src, dest); + if dst != dest && putback { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_simd(r); + } + } + fn emit_relaxed_binop3( + &mut self, + op: fn(&mut Assembler, Size, Location, Location, Location), + sz: Size, + src1: Location, + src2: Location, + dst: Location, + allow_imm: ImmType, + ) { + let mut temps = vec![]; + let src1 = self.location_to_reg(sz, src1, &mut temps, ImmType::None, true, None); + let src2 = self.location_to_reg(sz, src2, &mut temps, allow_imm, true, None); + let dest = self.location_to_reg(sz, dst, &mut temps, ImmType::None, false, None); + op(&mut self.assembler, sz, src1, src2, dest); + if dst != dest { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_binop3_neon( + &mut self, + op: fn(&mut Assembler, Size, Location, Location, Location), + sz: Size, + src1: Location, + src2: Location, + dst: Location, + allow_imm: ImmType, + ) { + let mut temps = vec![]; + let src1 = self.location_to_neon(sz, src1, &mut temps, ImmType::None, true); + let src2 = self.location_to_neon(sz, src2, &mut temps, allow_imm, true); + let dest = self.location_to_neon(sz, dst, &mut temps, ImmType::None, false); + op(&mut self.assembler, sz, src1, src2, dest); + if dst != dest { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_simd(r); + } + } + fn emit_relaxed_ldr64(&mut self, sz: Size, dst: Location, src: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(sz, dst, &mut temps, ImmType::None, false, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetDWord) { + self.assembler.emit_ldr(Size::S64, dest, src); + } else if self.compatible_imm(offset as i64, ImmType::UnscaledOffset) { + self.assembler.emit_ldur(Size::S64, dest, addr, offset); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_ldr( + Size::S64, + dest, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + if dst != dest { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_ldr32(&mut self, sz: Size, dst: Location, src: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(sz, dst, &mut temps, ImmType::None, false, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetWord) { + self.assembler.emit_ldr(Size::S32, dest, src); + } else if self.compatible_imm(offset as i64, ImmType::UnscaledOffset) { + self.assembler.emit_ldur(Size::S32, dest, addr, offset); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_ldr( + Size::S32, + dest, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + if dst != dest { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_ldr32s(&mut self, sz: Size, dst: Location, src: Location) { + let mut temps = vec![]; 
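        // Editorial note: like the other relaxed loads above, this sign-extending
        // 32-bit variant picks an addressing mode from the offset: a scaled
        // `ldrsw` when the offset fits the word-scaled immediate form, otherwise
        // the offset is first materialized into a temporary register and a
        // register-offset load is used. Temporaries pushed into `temps` are
        // released once the value has been moved to its final destination.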
+ let dest = self.location_to_reg(sz, dst, &mut temps, ImmType::None, false, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetWord) { + self.assembler.emit_ldrsw(Size::S64, dest, src); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_ldrsw( + Size::S64, + dest, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + if dst != dest { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_ldr16(&mut self, sz: Size, dst: Location, src: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(sz, dst, &mut temps, ImmType::None, false, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetHWord) { + self.assembler.emit_ldrh(Size::S32, dest, src); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_ldrh( + Size::S32, + dest, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + if dst != dest { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_ldr16s(&mut self, sz: Size, dst: Location, src: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(sz, dst, &mut temps, ImmType::None, false, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetHWord) { + self.assembler.emit_ldrsh(sz, dest, src); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_ldrsh( + sz, + dest, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + if dst != dest { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_ldr8(&mut self, sz: Size, dst: Location, src: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(sz, dst, &mut temps, ImmType::None, false, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetByte) { + self.assembler.emit_ldrb(Size::S32, dest, src); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_ldrb( + Size::S32, + dest, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + if dst != dest { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_ldr8s(&mut self, sz: Size, dst: Location, src: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(sz, dst, &mut temps, ImmType::None, false, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetByte) { + self.assembler.emit_ldrsb(sz, dest, src); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_ldrsb( + sz, + dest, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + 
if dst != dest { + self.move_location(sz, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_str64(&mut self, dst: Location, src: Location) { + let mut temps = vec![]; + let dst = self.location_to_reg(Size::S64, dst, &mut temps, ImmType::NoneXzr, true, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetDWord) { + self.assembler.emit_str(Size::S64, dst, src); + } else if self.compatible_imm(offset as i64, ImmType::UnscaledOffset) { + self.assembler.emit_stur(Size::S64, dst, addr, offset); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_str( + Size::S64, + dst, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => panic!("singlepass can't emit str64 {:?} {:?}", dst, src), + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_str32(&mut self, dst: Location, src: Location) { + let mut temps = vec![]; + let dst = self.location_to_reg(Size::S64, dst, &mut temps, ImmType::NoneXzr, true, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetWord) { + self.assembler.emit_str(Size::S32, dst, src); + } else if self.compatible_imm(offset as i64, ImmType::UnscaledOffset) { + self.assembler.emit_stur(Size::S32, dst, addr, offset); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_str( + Size::S32, + dst, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_str16(&mut self, dst: Location, src: Location) { + let mut temps = vec![]; + let dst = self.location_to_reg(Size::S64, dst, &mut temps, ImmType::NoneXzr, true, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetHWord) { + self.assembler.emit_strh(Size::S32, dst, src); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_strh( + Size::S32, + dst, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + for r in temps { + self.release_gpr(r); + } + } + fn emit_relaxed_str8(&mut self, dst: Location, src: Location) { + let mut temps = vec![]; + let dst = self.location_to_reg(Size::S64, dst, &mut temps, ImmType::NoneXzr, true, None); + match src { + Location::Memory(addr, offset) => { + if self.compatible_imm(offset as i64, ImmType::OffsetByte) { + self.assembler + .emit_strb(Size::S32, dst, Location::Memory(addr, offset)); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_strb( + Size::S32, + dst, + Location::Memory2(addr, tmp, Multiplier::One, 0), + ); + temps.push(tmp); + } + } + _ => unreachable!(), + } + for r in temps { + self.release_gpr(r); + } + } + /// I64 comparison with. 
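// Editorial note closing out the emit_relaxed_str* helpers above: each one
// chooses an STR encoding from the (compile-time known) offset. A rough sketch
// of the 64-bit selection rule (hypothetical helper, not code from this PR):
fn pick_str64_form(offset: i32) -> &'static str {
    if offset >= 0 && offset < 0x8000 && offset % 8 == 0 {
        "str with scaled unsigned immediate" // ImmType::OffsetDWord
    } else if offset > -256 && offset < 256 {
        "stur with unscaled 9-bit signed immediate" // ImmType::UnscaledOffset
    } else {
        "materialize the offset in a temp GPR, then str with a register offset"
    }
}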
+ fn emit_cmpop_i64_dynamic_b( + &mut self, + c: Condition, + loc_a: Location, + loc_b: Location, + ret: Location, + ) { + match ret { + Location::GPR(_) => { + self.emit_relaxed_cmp(Size::S64, loc_b, loc_a); + self.assembler.emit_cset(Size::S32, ret, c); + } + Location::Memory(_, _) => { + let tmp = self.acquire_temp_gpr().unwrap(); + self.emit_relaxed_cmp(Size::S64, loc_b, loc_a); + self.assembler.emit_cset(Size::S32, Location::GPR(tmp), c); + self.move_location(Size::S32, Location::GPR(tmp), ret); + self.release_gpr(tmp); + } + _ => { + unreachable!(); + } + } + } + /// I32 comparison with. + fn emit_cmpop_i32_dynamic_b( + &mut self, + c: Condition, + loc_a: Location, + loc_b: Location, + ret: Location, + ) { + match ret { + Location::GPR(_) => { + self.emit_relaxed_cmp(Size::S32, loc_b, loc_a); + self.assembler.emit_cset(Size::S32, ret, c); + } + Location::Memory(_, _) => { + let tmp = self.acquire_temp_gpr().unwrap(); + self.emit_relaxed_cmp(Size::S32, loc_b, loc_a); + self.assembler.emit_cset(Size::S32, Location::GPR(tmp), c); + self.move_location(Size::S32, Location::GPR(tmp), ret); + self.release_gpr(tmp); + } + _ => { + unreachable!(); + } + } + } + + fn memory_op( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + check_alignment: bool, + value_size: usize, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + cb: F, + ) { + let tmp_addr = self.acquire_temp_gpr().unwrap(); + + // Reusing `tmp_addr` for temporary indirection here, since it's not used before the last reference to `{base,bound}_loc`. + let (base_loc, bound_loc) = if imported_memories { + // Imported memories require one level of indirection. + self.emit_relaxed_binop( + Assembler::emit_mov, + Size::S64, + Location::Memory(self.get_vmctx_reg(), offset), + Location::GPR(tmp_addr), + true, + ); + (Location::Memory(tmp_addr, 0), Location::Memory(tmp_addr, 8)) + } else { + ( + Location::Memory(self.get_vmctx_reg(), offset), + Location::Memory(self.get_vmctx_reg(), offset + 8), + ) + }; + + let tmp_base = self.acquire_temp_gpr().unwrap(); + let tmp_bound = self.acquire_temp_gpr().unwrap(); + + // Load base into temporary register. + self.emit_relaxed_ldr64(Size::S64, Location::GPR(tmp_base), base_loc); + + // Load bound into temporary register, if needed. + if need_check { + self.emit_relaxed_ldr64(Size::S64, Location::GPR(tmp_bound), bound_loc); + + // Wasm -> Effective. + // Assuming we never underflow - should always be true on Linux/macOS and Windows >=8, + // since the first page from 0x0 to 0x1000 is not accepted by mmap. + self.assembler.emit_add( + Size::S64, + Location::GPR(tmp_bound), + Location::GPR(tmp_base), + Location::GPR(tmp_bound), + ); + if self.compatible_imm(value_size as _, ImmType::Bits12) { + self.assembler.emit_sub( + Size::S64, + Location::GPR(tmp_bound), + Location::GPR(tmp_bound), + Location::Imm32(value_size as _), + ); + } else { + let tmp2 = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp2), value_size as u64); + self.assembler.emit_sub( + Size::S64, + Location::GPR(tmp_bound), + Location::GPR(tmp2), + Location::GPR(tmp_bound), + ); + self.release_gpr(tmp2); + } + } + + // Load effective address. + // `base_loc` and `bound_loc` becomes INVALID after this line, because `tmp_addr` + // might be reused. + self.move_location(Size::S32, addr, Location::GPR(tmp_addr)); + + // Add offset to memory address. 
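        // Editorial note on the block below: the static wasm offset is added
        // with the flag-setting `adds` so that a 32-bit wrap-around of the
        // effective address can be detected; if the carry flag is set
        // (Condition::Cs) the computation overflowed and we branch straight to
        // the heap-access-out-of-bounds trap.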
+ if memarg.offset != 0 { + if self.compatible_imm(memarg.offset as _, ImmType::Bits12) { + self.assembler.emit_adds( + Size::S32, + Location::Imm32(memarg.offset), + Location::GPR(tmp_addr), + Location::GPR(tmp_addr), + ); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), memarg.offset as _); + self.assembler.emit_adds( + Size::S32, + Location::GPR(tmp_addr), + Location::GPR(tmp), + Location::GPR(tmp_addr), + ); + self.release_gpr(tmp); + } + + // Trap if offset calculation overflowed. + self.assembler + .emit_bcond_label(Condition::Cs, heap_access_oob); + } + + // Wasm linear memory -> real memory + self.assembler.emit_add( + Size::S64, + Location::GPR(tmp_base), + Location::GPR(tmp_addr), + Location::GPR(tmp_addr), + ); + + if need_check { + // Trap if the end address of the requested area is above that of the linear memory. + self.assembler + .emit_cmp(Size::S64, Location::GPR(tmp_bound), Location::GPR(tmp_addr)); + + // `tmp_bound` is inclusive. So trap only if `tmp_addr > tmp_bound`. + self.assembler + .emit_bcond_label(Condition::Hi, heap_access_oob); + } + + self.release_gpr(tmp_bound); + self.release_gpr(tmp_base); + + let align = memarg.align; + if check_alignment && align != 1 { + self.assembler.emit_tst( + Size::S64, + Location::Imm32((align - 1).into()), + Location::GPR(tmp_addr), + ); + self.assembler + .emit_bcond_label(Condition::Ne, heap_access_oob); + } + let begin = self.assembler.get_offset().0; + cb(self, tmp_addr); + let end = self.assembler.get_offset().0; + self.mark_address_range_with_trap_code(TrapCode::HeapAccessOutOfBounds, begin, end); + + self.release_gpr(tmp_addr); + } + + /*fn emit_compare_and_swap( + &mut self, + _loc: Location, + _target: Location, + _ret: Location, + _memarg: &MemoryImmediate, + _value_size: usize, + _memory_sz: Size, + _stack_sz: Size, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + _cb: F, + ) { + unimplemented!(); + }*/ + + fn offset_is_ok(&self, size: Size, offset: i32) -> bool { + if offset < 0 { + return false; + } + let shift = match size { + Size::S8 => 0, + Size::S16 => 1, + Size::S32 => 2, + Size::S64 => 3, + }; + if offset >= 0x1000 << shift { + return false; + } + if (offset & ((1 << shift) - 1)) != 0 { + return false; + } + return true; + } + + fn emit_push(&mut self, sz: Size, src: Location) { + match (sz, src) { + (Size::S64, Location::GPR(_)) | (Size::S64, Location::SIMD(_)) => { + let offset = if self.pushed { + 0 + } else { + self.assembler.emit_sub( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm8(16), + Location::GPR(GPR::XzrSp), + ); + 8 + }; + self.assembler.emit_stur(Size::S64, src, GPR::XzrSp, offset); + self.pushed = !self.pushed; + } + (Size::S64, _) => { + let mut temps = vec![]; + let src = self.location_to_reg(sz, src, &mut temps, ImmType::None, true, None); + let offset = if self.pushed { + 0 + } else { + self.assembler.emit_sub( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm8(16), + Location::GPR(GPR::XzrSp), + ); + 8 + }; + self.assembler.emit_stur(Size::S64, src, GPR::XzrSp, offset); + self.pushed = !self.pushed; + for r in temps { + self.release_gpr(r); + } + } + _ => panic!("singlepass can't emit PUSH {:?} {:?}", sz, src), + } + } + fn emit_double_push(&mut self, sz: Size, src1: Location, src2: Location) { + if !self.pushed { + match (sz, src1, src2) { + (Size::S64, Location::GPR(_), Location::GPR(_)) => { + self.assembler + .emit_stpdb(Size::S64, src1, src2, GPR::XzrSp, 16); + } 
+ _ => { + self.emit_push(sz, src1); + self.emit_push(sz, src2); + } + } + } else { + self.emit_push(sz, src1); + self.emit_push(sz, src2); + } + } + fn emit_pop(&mut self, sz: Size, dst: Location) { + match (sz, dst) { + (Size::S64, Location::GPR(_)) | (Size::S64, Location::SIMD(_)) => { + let offset = if self.pushed { 8 } else { 0 }; + self.assembler.emit_ldur(Size::S64, dst, GPR::XzrSp, offset); + if self.pushed { + self.assembler.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm8(16), + Location::GPR(GPR::XzrSp), + ); + } + self.pushed = !self.pushed; + } + _ => panic!("singlepass can't emit PUSH {:?} {:?}", sz, dst), + } + } + fn emit_double_pop(&mut self, sz: Size, dst1: Location, dst2: Location) { + if !self.pushed { + match (sz, dst1, dst2) { + (Size::S64, Location::GPR(_), Location::GPR(_)) => { + self.assembler + .emit_ldpia(Size::S64, dst1, dst2, GPR::XzrSp, 16); + } + _ => { + self.emit_pop(sz, dst2); + self.emit_pop(sz, dst1); + } + } + } else { + self.emit_pop(sz, dst2); + self.emit_pop(sz, dst1); + } + } + + fn set_default_nan(&mut self, temps: &mut Vec) -> GPR { + // temporarly set FPCR to DefaultNan + let old_fpcr = self.acquire_temp_gpr().unwrap(); + temps.push(old_fpcr.clone()); + self.assembler.emit_read_fpcr(old_fpcr); + let new_fpcr = self.acquire_temp_gpr().unwrap(); + temps.push(new_fpcr.clone()); + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + self.assembler + .emit_mov(Size::S32, Location::Imm32(1), Location::GPR(tmp)); + self.assembler + .emit_mov(Size::S64, Location::GPR(old_fpcr), Location::GPR(new_fpcr)); + // DN is bit 25 of FPCR + self.assembler.emit_bfi( + Size::S64, + Location::GPR(tmp), + 25, + 1, + Location::GPR(new_fpcr), + ); + self.assembler.emit_write_fpcr(new_fpcr); + old_fpcr + } + fn set_trap_enabled(&mut self, temps: &mut Vec) -> GPR { + // temporarly set FPCR to DefaultNan + let old_fpcr = self.acquire_temp_gpr().unwrap(); + temps.push(old_fpcr.clone()); + self.assembler.emit_read_fpcr(old_fpcr); + let new_fpcr = self.acquire_temp_gpr().unwrap(); + temps.push(new_fpcr.clone()); + self.assembler + .emit_mov(Size::S64, Location::GPR(old_fpcr), Location::GPR(new_fpcr)); + // IOE is bit 8 of FPCR + self.assembler + .emit_bfc(Size::S64, 8, 1, Location::GPR(new_fpcr)); + self.assembler.emit_write_fpcr(new_fpcr); + old_fpcr + } + fn restore_fpcr(&mut self, old_fpcr: GPR) { + self.assembler.emit_write_fpcr(old_fpcr); + } + + fn reset_exception_fpsr(&mut self) { + // reset exception count in FPSR + let fpsr = self.acquire_temp_gpr().unwrap(); + self.assembler.emit_read_fpsr(fpsr); + // IOC is 0 + self.assembler + .emit_bfc(Size::S64, 0, 1, Location::GPR(fpsr)); + self.assembler.emit_write_fpsr(fpsr); + self.release_gpr(fpsr); + } + fn read_fpsr(&mut self) -> GPR { + let fpsr = self.acquire_temp_gpr().unwrap(); + self.assembler.emit_read_fpsr(fpsr); + fpsr + } + + fn trap_float_convertion_errors( + &mut self, + old_fpcr: GPR, + sz: Size, + f: Location, + temps: &mut Vec, + ) { + let trap_badconv = self.assembler.get_label(); + let end = self.assembler.get_label(); + + let fpsr = self.read_fpsr(); + temps.push(fpsr.clone()); + // no trap, than all good + self.assembler + .emit_tbz_label(Size::S32, Location::GPR(fpsr), 0, end); + // now need to check if it's overflow or NaN + self.assembler + .emit_bfc(Size::S64, 0, 4, Location::GPR(fpsr)); + self.restore_fpcr(old_fpcr); + self.assembler.emit_fcmp(sz, f, f); + self.assembler.emit_bcond_label(Condition::Vs, trap_badconv); + // fallthru: trap_overflow + let offset 
= self.assembler.get_offset().0; + self.trap_table + .offset_to_code + .insert(offset, TrapCode::IntegerOverflow); + self.emit_illegal_op(); + self.mark_instruction_address_end(offset); + + self.emit_label(trap_badconv); + let offset = self.assembler.get_offset().0; + self.trap_table + .offset_to_code + .insert(offset, TrapCode::BadConversionToInteger); + self.emit_illegal_op(); + self.mark_instruction_address_end(offset); + + self.emit_label(end); + self.restore_fpcr(old_fpcr); + } +} + +impl Machine for MachineARM64 { + type GPR = GPR; + type SIMD = NEON; + fn assembler_get_offset(&self) -> Offset { + self.assembler.get_offset() + } + fn index_from_gpr(&self, x: GPR) -> RegisterIndex { + RegisterIndex(x as usize) + } + fn index_from_simd(&self, x: NEON) -> RegisterIndex { + RegisterIndex(x as usize + 32) + } + + fn get_vmctx_reg(&self) -> GPR { + GPR::X28 + } + + fn get_used_gprs(&self) -> Vec { + self.used_gprs.iter().cloned().collect() + } + + fn get_used_simd(&self) -> Vec { + self.used_simd.iter().cloned().collect() + } + + fn pick_gpr(&self) -> Option { + use GPR::*; + static REGS: &[GPR] = &[X9, X10, X11, X12, X13, X14, X15]; + for r in REGS { + if !self.used_gprs.contains(r) { + return Some(*r); + } + } + None + } + + // Picks an unused general purpose register for internal temporary use. + fn pick_temp_gpr(&self) -> Option { + use GPR::*; + static REGS: &[GPR] = &[X8, X7, X6, X5, X4, X3, X2, X1]; + for r in REGS { + if !self.used_gprs.contains(r) { + return Some(*r); + } + } + None + } + + fn acquire_temp_gpr(&mut self) -> Option { + let gpr = self.pick_temp_gpr(); + if let Some(x) = gpr { + self.used_gprs.insert(x); + } + gpr + } + + fn release_gpr(&mut self, gpr: GPR) { + assert!(self.used_gprs.remove(&gpr)); + } + + fn reserve_unused_temp_gpr(&mut self, gpr: GPR) -> GPR { + assert!(!self.used_gprs.contains(&gpr)); + self.used_gprs.insert(gpr); + gpr + } + + fn reserve_gpr(&mut self, gpr: GPR) { + self.used_gprs.insert(gpr); + } + + fn push_used_gpr(&mut self) -> usize { + let used_gprs = self.get_used_gprs(); + if used_gprs.len() % 2 == 1 { + self.emit_push(Size::S64, Location::GPR(GPR::XzrSp)); + } + for r in used_gprs.iter() { + self.emit_push(Size::S64, Location::GPR(*r)); + } + ((used_gprs.len() + 1) / 2) * 16 + } + fn pop_used_gpr(&mut self) { + let used_gprs = self.get_used_gprs(); + for r in used_gprs.iter().rev() { + self.emit_pop(Size::S64, Location::GPR(*r)); + } + if used_gprs.len() % 2 == 1 { + self.emit_pop(Size::S64, Location::GPR(GPR::XzrSp)); + } + } + + // Picks an unused NEON register. + fn pick_simd(&self) -> Option { + use NEON::*; + static REGS: &[NEON] = &[V8, V9, V10, V11, V12]; + for r in REGS { + if !self.used_simd.contains(r) { + return Some(*r); + } + } + None + } + + // Picks an unused NEON register for internal temporary use. + fn pick_temp_simd(&self) -> Option { + use NEON::*; + static REGS: &[NEON] = &[V0, V1, V2, V3, V4, V5, V6, V7]; + for r in REGS { + if !self.used_simd.contains(r) { + return Some(*r); + } + } + None + } + + // Acquires a temporary NEON register. + fn acquire_temp_simd(&mut self) -> Option { + let simd = self.pick_temp_simd(); + if let Some(x) = simd { + self.used_simd.insert(x); + } + simd + } + + fn reserve_simd(&mut self, simd: NEON) { + self.used_simd.insert(simd); + } + + // Releases a temporary NEON register. 
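// Editorial note on the register scheme set up above: X9..X15 are the
// allocatable GPRs and V8..V12 the allocatable NEON registers, with X1..X8 and
// V0..V7 reserved as short-lived temporaries and X28 holding the VMContext.
// Pushes are kept in 16-byte pairs, padding with a zero-register push when an
// odd number of GPRs has to be saved. A sketch of the byte count returned by
// push_used_gpr (hypothetical helper, not code from this PR):
fn pushed_gpr_bytes(n_used: usize) -> usize {
    ((n_used + 1) / 2) * 16 // 0 -> 0, 1 -> 16, 2 -> 16, 3 -> 32
}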
+ fn release_simd(&mut self, simd: NEON) { + assert_eq!(self.used_simd.remove(&simd), true); + } + + fn push_used_simd(&mut self) -> usize { + let used_neons = self.get_used_simd(); + let stack_adjust = if used_neons.len() & 1 == 1 { + (used_neons.len() * 8) as u32 + 8 + } else { + (used_neons.len() * 8) as u32 + }; + self.adjust_stack(stack_adjust); + + for (i, r) in used_neons.iter().enumerate() { + self.assembler.emit_str( + Size::S64, + Location::SIMD(*r), + Location::Memory(GPR::XzrSp, (i * 8) as i32), + ); + } + stack_adjust as usize + } + fn pop_used_simd(&mut self) { + let used_neons = self.get_used_simd(); + for (i, r) in used_neons.iter().enumerate() { + self.assembler.emit_ldr( + Size::S64, + Location::SIMD(*r), + Location::Memory(GPR::XzrSp, (i * 8) as i32), + ); + } + let stack_adjust = if used_neons.len() & 1 == 1 { + (used_neons.len() * 8) as u32 + 8 + } else { + (used_neons.len() * 8) as u32 + }; + self.assembler.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm32(stack_adjust as _), + Location::GPR(GPR::XzrSp), + ); + } + + /// Set the source location of the Wasm to the given offset. + fn set_srcloc(&mut self, offset: u32) { + self.src_loc = offset; + } + /// Marks each address in the code range emitted by `f` with the trap code `code`. + fn mark_address_range_with_trap_code(&mut self, code: TrapCode, begin: usize, end: usize) { + for i in begin..end { + self.trap_table.offset_to_code.insert(i, code); + } + self.mark_instruction_address_end(begin); + } + + /// Marks one address as trappable with trap code `code`. + fn mark_address_with_trap_code(&mut self, code: TrapCode) { + let offset = self.assembler.get_offset().0; + self.trap_table.offset_to_code.insert(offset, code); + self.mark_instruction_address_end(offset); + } + /// Marks the instruction as trappable with trap code `code`. return "begin" offset + fn mark_instruction_with_trap_code(&mut self, code: TrapCode) -> usize { + let offset = self.assembler.get_offset().0; + self.trap_table.offset_to_code.insert(offset, code); + offset + } + /// Pushes the instruction to the address map, calculating the offset from a + /// provided beginning address. 
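// Editorial note on push_used_simd above: only the low 64 bits of each used
// NEON register are saved, in 8-byte slots, and the reserved area is padded to
// a 16-byte multiple so SP stays aligned. A sketch of that size computation
// (hypothetical helper, not code from this PR):
fn simd_spill_bytes(n_used: usize) -> usize {
    let raw = n_used * 8;
    if n_used % 2 == 1 { raw + 8 } else { raw } // 3 regs -> 32, 4 regs -> 32
}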
+ fn mark_instruction_address_end(&mut self, begin: usize) { + self.instructions_address_map.push(InstructionAddressMap { + srcloc: SourceLoc::new(self.src_loc), + code_offset: begin, + code_len: self.assembler.get_offset().0 - begin, + }); + } + + /// Insert a StackOverflow (at offset 0) + fn insert_stackoverflow(&mut self) { + let offset = 0; + self.trap_table + .offset_to_code + .insert(offset, TrapCode::StackOverflow); + self.mark_instruction_address_end(offset); + } + + /// Get all current TrapInformation + fn collect_trap_information(&self) -> Vec { + self.trap_table + .offset_to_code + .clone() + .into_iter() + .map(|(offset, code)| TrapInformation { + code_offset: offset as u32, + trap_code: code, + }) + .collect() + } + + fn instructions_address_map(&self) -> Vec { + self.instructions_address_map.clone() + } + + // Return a rounded stack adjustement value (must be multiple of 16bytes on ARM64 for example) + fn round_stack_adjust(&self, value: usize) -> usize { + if value & 0xf != 0 { + ((value >> 4) + 1) << 4 + } else { + value + } + } + + // Memory location for a local on the stack + fn local_on_stack(&mut self, stack_offset: i32) -> Location { + Location::Memory(GPR::X29, -stack_offset) + } + + // Adjust stack for locals + fn adjust_stack(&mut self, delta_stack_offset: u32) { + let delta = if self.compatible_imm(delta_stack_offset as _, ImmType::Bits12) { + Location::Imm32(delta_stack_offset as _) + } else { + let tmp = GPR::X17; + self.assembler + .emit_mov_imm(Location::GPR(tmp), delta_stack_offset as u64); + Location::GPR(tmp) + }; + self.assembler.emit_sub( + Size::S64, + Location::GPR(GPR::XzrSp), + delta, + Location::GPR(GPR::XzrSp), + ); + } + // restore stack + fn restore_stack(&mut self, delta_stack_offset: u32) { + let delta = if self.compatible_imm(delta_stack_offset as _, ImmType::Bits12) { + Location::Imm32(delta_stack_offset as _) + } else { + let tmp = GPR::X17; + self.assembler + .emit_mov_imm(Location::GPR(tmp), delta_stack_offset as u64); + Location::GPR(tmp) + }; + self.assembler.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + delta, + Location::GPR(GPR::XzrSp), + ); + } + fn pop_stack_locals(&mut self, delta_stack_offset: u32) { + let real_delta = if delta_stack_offset & 15 != 0 { + delta_stack_offset + 8 + } else { + delta_stack_offset + }; + let delta = if self.compatible_imm(real_delta as i64, ImmType::Bits12) { + Location::Imm32(real_delta as _) + } else { + let tmp = GPR::X17; + self.assembler + .emit_mov_imm(Location::GPR(tmp), real_delta as u64); + Location::GPR(tmp) + }; + self.assembler.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + delta, + Location::GPR(GPR::XzrSp), + ); + } + // push a value on the stack for a native call + fn push_location_for_native(&mut self, loc: Location) { + match loc { + Location::Imm64(_) => { + self.move_location(Size::S64, loc, Location::GPR(GPR::X17)); + self.emit_push(Size::S64, Location::GPR(GPR::X17)); + } + _ => self.emit_push(Size::S64, loc), + } + } + + // Zero a location that is 32bits + fn zero_location(&mut self, size: Size, location: Location) { + self.move_location(size, Location::GPR(GPR::XzrSp), location); + } + + // GPR Reg used for local pointer on the stack + fn local_pointer(&self) -> GPR { + GPR::X29 + } + + // Determine whether a local should be allocated on the stack. + fn is_local_on_stack(&self, idx: usize) -> bool { + idx > 7 + } + + // Determine a local's location. 
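// Editorial note before the locals handling below: round_stack_adjust, defined
// above, rounds a byte count up to the 16-byte stack alignment ARM64 requires
// for SP-relative accesses. The shift-based expression in the diff is
// equivalent to the usual closed form (hypothetical helper, not code from
// this PR):
fn round_up_16(value: usize) -> usize {
    (value + 15) & !15 // 0 -> 0, 1 -> 16, 16 -> 16, 17 -> 32
}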
+ fn get_local_location(&self, idx: usize, callee_saved_regs_size: usize) -> Location { + // Use callee-saved registers for the first locals. + match idx { + 0 => Location::GPR(GPR::X19), + 1 => Location::GPR(GPR::X20), + 2 => Location::GPR(GPR::X21), + 3 => Location::GPR(GPR::X22), + 4 => Location::GPR(GPR::X23), + 5 => Location::GPR(GPR::X24), + 6 => Location::GPR(GPR::X25), + 7 => Location::GPR(GPR::X26), + _ => Location::Memory(GPR::X29, -(((idx - 7) * 8 + callee_saved_regs_size) as i32)), + } + } + // Move a local to the stack + fn move_local(&mut self, stack_offset: i32, location: Location) { + if stack_offset < 256 { + self.assembler + .emit_stur(Size::S64, location, GPR::X29, -stack_offset); + } else { + let tmp = GPR::X17; + self.assembler + .emit_mov_imm(Location::GPR(tmp), (stack_offset as i64) as u64); + self.assembler.emit_sub( + Size::S64, + Location::GPR(GPR::X29), + Location::GPR(tmp), + Location::GPR(tmp), + ); + self.assembler + .emit_str(Size::S64, location, Location::GPR(tmp)); + } + } + + // List of register to save, depending on the CallingConvention + fn list_to_save(&self, _calling_convention: CallingConvention) -> Vec { + vec![] + } + + // Get param location, MUST be called in order! + fn get_param_location( + &self, + idx: usize, + sz: Size, + stack_args: &mut usize, + calling_convention: CallingConvention, + ) -> Location { + match calling_convention { + CallingConvention::AppleAarch64 => match idx { + 0 => Location::GPR(GPR::X0), + 1 => Location::GPR(GPR::X1), + 2 => Location::GPR(GPR::X2), + 3 => Location::GPR(GPR::X3), + 4 => Location::GPR(GPR::X4), + 5 => Location::GPR(GPR::X5), + 6 => Location::GPR(GPR::X6), + 7 => Location::GPR(GPR::X7), + _ => { + let sz = match sz { + Size::S8 => 0, + Size::S16 => 1, + Size::S32 => 2, + Size::S64 => 3, + }; + // align first + if sz > 1 { + if *stack_args & !((1 << sz) - 1) != 0 { + *stack_args = (*stack_args + ((1 << sz) - 1)) & !((1 << sz) - 1); + } + } + let loc = Location::Memory(GPR::XzrSp, *stack_args as i32); + *stack_args += 1 << sz; + loc + } + }, + _ => match idx { + 0 => Location::GPR(GPR::X0), + 1 => Location::GPR(GPR::X1), + 2 => Location::GPR(GPR::X2), + 3 => Location::GPR(GPR::X3), + 4 => Location::GPR(GPR::X4), + 5 => Location::GPR(GPR::X5), + 6 => Location::GPR(GPR::X6), + 7 => Location::GPR(GPR::X7), + _ => { + let loc = Location::Memory(GPR::XzrSp, *stack_args as i32); + *stack_args += 8; + loc + } + }, + } + } + // Get call param location, MUST be called in order! 
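// Editorial note on the param-location helpers: the first eight arguments map
// to X0..X7 in these helpers; past that, the non-Apple path uses one 8-byte
// stack slot per argument, while the AppleAarch64 path packs stack arguments
// to their natural size and alignment, as the trampolines earlier in this diff
// also do. A sketch of that packing rule (hypothetical helper and sizes, not
// code from this PR):
fn apple_stack_offsets(arg_sizes: &[usize]) -> Vec<usize> {
    let mut off = 0usize;
    arg_sizes
        .iter()
        .map(|&size| {
            off = (off + size - 1) & !(size - 1); // align to the value's size (1, 2, 4 or 8)
            let here = off;
            off += size;
            here
        })
        .collect()
}
// e.g. apple_stack_offsets(&[4, 4, 8]) == vec![0, 4, 8], versus 0, 8, 16 with 8-byte slots.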
+ fn get_call_param_location( + &self, + idx: usize, + sz: Size, + stack_args: &mut usize, + calling_convention: CallingConvention, + ) -> Location { + match calling_convention { + CallingConvention::AppleAarch64 => match idx { + 0 => Location::GPR(GPR::X0), + 1 => Location::GPR(GPR::X1), + 2 => Location::GPR(GPR::X2), + 3 => Location::GPR(GPR::X3), + 4 => Location::GPR(GPR::X4), + 5 => Location::GPR(GPR::X5), + 6 => Location::GPR(GPR::X6), + 7 => Location::GPR(GPR::X7), + _ => { + let sz = match sz { + Size::S8 => 0, + Size::S16 => 1, + Size::S32 => 2, + Size::S64 => 3, + }; + // align first + if sz > 1 { + if *stack_args & !((1 << sz) - 1) != 0 { + *stack_args = (*stack_args + ((1 << sz) - 1)) & !((1 << sz) - 1); + } + } + let loc = Location::Memory(GPR::X29, 16 * 2 + *stack_args as i32); + *stack_args += 1 << sz; + loc + } + }, + _ => match idx { + 0 => Location::GPR(GPR::X0), + 1 => Location::GPR(GPR::X1), + 2 => Location::GPR(GPR::X2), + 3 => Location::GPR(GPR::X3), + 4 => Location::GPR(GPR::X4), + 5 => Location::GPR(GPR::X5), + 6 => Location::GPR(GPR::X6), + 7 => Location::GPR(GPR::X7), + _ => { + let loc = Location::Memory(GPR::X29, 16 * 2 + *stack_args as i32); + *stack_args += 8; + loc + } + }, + } + } + // Get simple param location, Will not be accurate for Apple calling convention on "stack" arguments + fn get_simple_param_location( + &self, + idx: usize, + calling_convention: CallingConvention, + ) -> Location { + match calling_convention { + _ => match idx { + 0 => Location::GPR(GPR::X0), + 1 => Location::GPR(GPR::X1), + 2 => Location::GPR(GPR::X2), + 3 => Location::GPR(GPR::X3), + 4 => Location::GPR(GPR::X4), + 5 => Location::GPR(GPR::X5), + 6 => Location::GPR(GPR::X6), + 7 => Location::GPR(GPR::X7), + _ => Location::Memory(GPR::X29, (16 * 2 + (idx - 8) * 8) as i32), + }, + } + } + // move a location to another + fn move_location(&mut self, size: Size, source: Location, dest: Location) { + match source { + Location::GPR(_) | Location::SIMD(_) => match dest { + Location::GPR(_) | Location::SIMD(_) => self.assembler.emit_mov(size, source, dest), + Location::Memory(addr, offs) => { + if self.offset_is_ok(size, offs) { + self.assembler.emit_str(size, source, dest); + } else if self.compatible_imm(offs as i64, ImmType::UnscaledOffset) { + self.assembler.emit_stur(size, source, addr, offs); + } else { + let tmp = GPR::X17; + if offs < 0 { + self.assembler + .emit_mov_imm(Location::GPR(tmp), (-offs) as u64); + self.assembler.emit_sub( + Size::S64, + Location::GPR(addr), + Location::GPR(tmp), + Location::GPR(tmp), + ); + } else { + self.assembler.emit_mov_imm(Location::GPR(tmp), offs as u64); + self.assembler.emit_add( + Size::S64, + Location::GPR(addr), + Location::GPR(tmp), + Location::GPR(tmp), + ); + } + self.assembler + .emit_str(size, source, Location::Memory(tmp, 0)); + } + } + _ => panic!( + "singlepass can't emit move_location {:?} {:?} => {:?}", + size, source, dest + ), + }, + Location::Imm8(_) => match dest { + Location::GPR(_) => self.assembler.emit_mov(size, source, dest), + Location::Memory(_, _) => match size { + Size::S64 => self.emit_relaxed_str64(source, dest), + Size::S32 => self.emit_relaxed_str32(source, dest), + Size::S16 => self.emit_relaxed_str16(source, dest), + Size::S8 => self.emit_relaxed_str8(source, dest), + }, + _ => panic!( + "singlepass can't emit move_location {:?} {:?} => {:?}", + size, source, dest + ), + }, + Location::Imm32(val) => match dest { + Location::GPR(_) => self.assembler.emit_mov_imm(dest, val as u64), + Location::Memory(_, _) => match 
size { + Size::S64 => self.emit_relaxed_str64(source, dest), + Size::S32 => self.emit_relaxed_str32(source, dest), + Size::S16 => self.emit_relaxed_str16(source, dest), + Size::S8 => self.emit_relaxed_str8(source, dest), + }, + _ => panic!( + "singlepass can't emit move_location {:?} {:?} => {:?}", + size, source, dest + ), + }, + Location::Imm64(val) => match dest { + Location::GPR(_) => self.assembler.emit_mov_imm(dest, val), + Location::Memory(_, _) => match size { + Size::S64 => self.emit_relaxed_str64(source, dest), + Size::S32 => self.emit_relaxed_str32(source, dest), + Size::S16 => self.emit_relaxed_str16(source, dest), + Size::S8 => self.emit_relaxed_str8(source, dest), + }, + _ => panic!( + "singlepass can't emit move_location {:?} {:?} => {:?}", + size, source, dest + ), + }, + Location::Memory(addr, offs) => match dest { + Location::GPR(_) | Location::SIMD(_) => { + if self.offset_is_ok(size, offs) { + self.assembler.emit_ldr(size, dest, source); + } else if offs > -256 && offs < 256 { + self.assembler.emit_ldur(size, dest, addr, offs); + } else { + let tmp = GPR::X17; + if offs < 0 { + self.assembler + .emit_mov_imm(Location::GPR(tmp), (-offs) as u64); + self.assembler.emit_sub( + Size::S64, + Location::GPR(addr), + Location::GPR(tmp), + Location::GPR(tmp), + ); + } else { + self.assembler.emit_mov_imm(Location::GPR(tmp), offs as u64); + self.assembler.emit_add( + Size::S64, + Location::GPR(addr), + Location::GPR(tmp), + Location::GPR(tmp), + ); + } + self.assembler + .emit_ldr(size, dest, Location::Memory(tmp, 0)); + } + } + _ => { + let mut temps = vec![]; + let src = + self.location_to_reg(size, source, &mut temps, ImmType::None, true, None); + self.move_location(size, src, dest); + for r in temps { + self.release_gpr(r); + } + } + }, + _ => panic!( + "singlepass can't emit move_location {:?} {:?} => {:?}", + size, source, dest + ), + } + } + // move a location to another + fn move_location_extend( + &mut self, + size_val: Size, + signed: bool, + source: Location, + size_op: Size, + dest: Location, + ) { + if size_op != Size::S64 { + unreachable!(); + } + let mut temps = vec![]; + let dst = self.location_to_reg(size_op, dest, &mut temps, ImmType::None, false, None); + let src = match (size_val, signed, source) { + (Size::S64, _, _) => source, + (Size::S32, false, Location::GPR(_)) => { + self.assembler.emit_mov(size_val, source, dst); + dst + } + (Size::S32, true, Location::GPR(_)) => { + self.assembler.emit_sxtw(size_val, source, dst); + dst + } + (Size::S32, false, Location::Memory(_, _)) => { + self.emit_relaxed_ldr32(size_op, dst, source); + dst + } + (Size::S32, true, Location::Memory(_, _)) => { + self.emit_relaxed_ldr32s(size_op, dst, source); + dst + } + _ => panic!( + "singlepass can't emit move_location_extend {:?} {:?} {:?} => {:?} {:?}", + size_val, signed, source, size_op, dest + ), + }; + if src != dst { + self.move_location(size_op, src, dst); + } + if dst != dest { + self.move_location(size_op, dst, dest); + } + for r in temps { + self.release_gpr(r); + } + } + fn load_address(&mut self, _size: Size, _reg: Location, _mem: Location) { + unimplemented!(); + } + // Init the stack loc counter + fn init_stack_loc(&mut self, init_stack_loc_cnt: u64, last_stack_loc: Location) { + let label = self.assembler.get_label(); + let mut temps = vec![]; + let dest = self.acquire_temp_gpr().unwrap(); + temps.push(dest.clone()); + let cnt = self.location_to_reg( + Size::S64, + Location::Imm64(init_stack_loc_cnt), + &mut temps, + ImmType::None, + true, + None, + ); + let dest = 
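// --- editorial sketch, not part of the patch ---------------------------------
// move_location above picks one of three addressing strategies for memory
// operands: the scaled unsigned 12-bit STR/LDR immediate when the offset fits
// (what offset_is_ok is assumed to test), the signed 9-bit unscaled STUR/LDUR
// form for small offsets, and otherwise an address computed into the X17
// scratch register. Rough host-side model:
#[derive(Debug, PartialEq)]
enum AddrMode {
    ScaledImm,     // STR/LDR [base, #imm12 * size]
    UnscaledImm,   // STUR/LDUR [base, #imm9]
    ViaScratchReg, // materialize the offset into X17, then address through it
}

fn pick_addr_mode(access_bytes: i64, offset: i64) -> AddrMode {
    if offset >= 0 && offset % access_bytes == 0 && offset / access_bytes <= 4095 {
        AddrMode::ScaledImm
    } else if (-256..256).contains(&offset) {
        AddrMode::UnscaledImm
    } else {
        AddrMode::ViaScratchReg
    }
}
// e.g. pick_addr_mode(8, 32760) == ScaledImm, pick_addr_mode(4, -8) == UnscaledImm,
//      pick_addr_mode(8, 40000) == ViaScratchReg.
// -----------------------------------------------------------------------------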
match last_stack_loc { + Location::GPR(_) => unreachable!(), + Location::SIMD(_) => unreachable!(), + Location::Memory(reg, offset) => { + if offset < 0 { + let offset = (-offset) as u32; + if self.compatible_imm(offset as i64, ImmType::Bits12) { + self.assembler.emit_sub( + Size::S64, + Location::GPR(reg), + Location::Imm32(offset), + Location::GPR(dest), + ); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_sub( + Size::S64, + Location::GPR(reg), + Location::GPR(tmp), + Location::GPR(dest), + ); + temps.push(tmp); + } + dest + } else { + let offset = offset as u32; + if self.compatible_imm(offset as i64, ImmType::Bits12) { + self.assembler.emit_add( + Size::S64, + Location::GPR(reg), + Location::Imm32(offset), + Location::GPR(dest), + ); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), (offset as i64) as u64); + self.assembler.emit_add( + Size::S64, + Location::GPR(reg), + Location::GPR(tmp), + Location::GPR(dest), + ); + temps.push(tmp); + } + dest + } + } + _ => panic!("singlepass can't emit init_stack_loc {:?}", last_stack_loc), + }; + self.assembler.emit_label(label); + self.assembler + .emit_stria(Size::S64, Location::GPR(GPR::XzrSp), dest, 8); + self.assembler + .emit_sub(Size::S64, cnt, Location::Imm8(1), cnt); + self.assembler.emit_cbnz_label(Size::S64, cnt, label); + for r in temps { + self.release_gpr(r); + } + } + // Restore save_area + fn restore_saved_area(&mut self, saved_area_offset: i32) { + let real_delta = if saved_area_offset & 15 != 0 { + self.pushed = true; + saved_area_offset + 8 + } else { + self.pushed = false; + saved_area_offset + }; + if self.compatible_imm(real_delta as _, ImmType::Bits12) { + self.assembler.emit_sub( + Size::S64, + Location::GPR(GPR::X29), + Location::Imm32(real_delta as _), + Location::GPR(GPR::XzrSp), + ); + } else { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), real_delta as u64); + self.assembler.emit_sub( + Size::S64, + Location::GPR(GPR::X29), + Location::GPR(tmp), + Location::GPR(GPR::XzrSp), + ); + self.release_gpr(tmp); + } + } + // Pop a location + fn pop_location(&mut self, location: Location) { + self.emit_pop(Size::S64, location); + } + // Create a new `MachineState` with default values. + fn new_machine_state(&self) -> MachineState { + new_machine_state() + } + + // assembler finalize + fn assembler_finalize(self) -> Vec { + self.assembler.finalize().unwrap() + } + + fn get_offset(&self) -> Offset { + self.assembler.get_offset() + } + + fn finalize_function(&mut self) { + self.assembler.finalize_function(); + } + + fn emit_function_prolog(&mut self) { + self.emit_double_push(Size::S64, Location::GPR(GPR::X29), Location::GPR(GPR::X30)); // save LR too + self.emit_double_push(Size::S64, Location::GPR(GPR::X27), Location::GPR(GPR::X28)); + // cannot use mov, because XSP is XZR there. Need to use ADD with #0 + self.assembler.emit_add( + Size::S64, + Location::GPR(GPR::XzrSp), + Location::Imm8(0), + Location::GPR(GPR::X29), + ); + } + + fn emit_function_epilog(&mut self) { + // cannot use mov, because XSP is XZR there. 
Need to use ADD with #0 + self.assembler.emit_add( + Size::S64, + Location::GPR(GPR::X29), + Location::Imm8(0), + Location::GPR(GPR::XzrSp), + ); + self.pushed = false; // SP is restored, consider it aligned + self.emit_double_pop(Size::S64, Location::GPR(GPR::X27), Location::GPR(GPR::X28)); + self.emit_double_pop(Size::S64, Location::GPR(GPR::X29), Location::GPR(GPR::X30)); + } + + fn emit_function_return_value(&mut self, ty: WpType, canonicalize: bool, loc: Location) { + if canonicalize { + self.canonicalize_nan( + match ty { + WpType::F32 => Size::S32, + WpType::F64 => Size::S64, + _ => unreachable!(), + }, + loc, + Location::GPR(GPR::X0), + ); + } else { + self.emit_relaxed_mov(Size::S64, loc, Location::GPR(GPR::X0)); + } + } + + fn emit_function_return_float(&mut self) { + self.assembler + .emit_mov(Size::S64, Location::GPR(GPR::X0), Location::SIMD(NEON::V0)); + } + + fn arch_supports_canonicalize_nan(&self) -> bool { + self.assembler.arch_supports_canonicalize_nan() + } + fn canonicalize_nan(&mut self, sz: Size, input: Location, output: Location) { + let mut tempn = vec![]; + let mut temps = vec![]; + let old_fpcr = self.set_default_nan(&mut temps); + // use FMAX (input, intput) => output to automaticaly normalize the NaN + match (sz, input, output) { + (Size::S32, Location::SIMD(_), Location::SIMD(_)) => { + self.assembler.emit_fmax(sz, input, input, output); + } + (Size::S64, Location::SIMD(_), Location::SIMD(_)) => { + self.assembler.emit_fmax(sz, input, input, output); + } + (Size::S32, Location::SIMD(_), _) | (Size::S64, Location::SIMD(_), _) => { + let tmp = self.location_to_neon(sz, output, &mut tempn, ImmType::None, false); + self.assembler.emit_fmax(sz, input, input, tmp); + self.move_location(sz, tmp, output); + } + _ => panic!( + "singlepass can't emit canonicalize_nan {:?} {:?} {:?}", + sz, input, output + ), + } + + self.restore_fpcr(old_fpcr); + for r in temps { + self.release_gpr(r); + } + for r in tempn { + self.release_simd(r); + } + } + + fn emit_illegal_op(&mut self) { + self.assembler.emit_udf(); + } + fn get_label(&mut self) -> Label { + self.assembler.new_dynamic_label() + } + fn emit_label(&mut self, label: Label) { + self.assembler.emit_label(label); + } + fn get_grp_for_call(&self) -> GPR { + GPR::X27 + } + fn emit_call_register(&mut self, reg: GPR) { + self.assembler.emit_call_register(reg); + } + fn emit_call_label(&mut self, label: Label) { + self.assembler.emit_call_label(label); + } + fn get_gpr_for_ret(&self) -> GPR { + GPR::X0 + } + fn get_simd_for_ret(&self) -> NEON { + NEON::V0 + } + + fn arch_requires_indirect_call_trampoline(&self) -> bool { + self.assembler.arch_requires_indirect_call_trampoline() + } + + fn arch_emit_indirect_call_with_trampoline(&mut self, location: Location) { + self.assembler + .arch_emit_indirect_call_with_trampoline(location); + } + + fn emit_debug_breakpoint(&mut self) { + self.assembler.emit_brk(); + } + + fn emit_call_location(&mut self, location: Location) { + let mut temps = vec![]; + let loc = self.location_to_reg( + Size::S64, + location, + &mut temps, + ImmType::None, + true, + Some(GPR::X27), + ); + match loc { + Location::GPR(reg) => self.assembler.emit_call_register(reg), + _ => unreachable!(), + } + for r in temps { + self.release_gpr(r); + } + } + + fn location_address(&mut self, _size: Size, _source: Location, _dest: Location) { + unimplemented!(); + } + // logic + fn location_and(&mut self, _size: Size, _source: Location, _dest: Location, _flags: bool) { + unimplemented!(); + } + fn location_xor(&mut self, 
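// --- editorial sketch, not part of the patch ---------------------------------
// Why the prologue/epilogue copy the stack pointer with `ADD ..., #0` instead
// of MOV: register number 31 (GPR::XzrSp above) is context-dependent on
// AArch64. The ordinary MOV alias is ORR-based and reads operand 31 as XZR
// (always zero), while ADD/SUB with an immediate reads it as SP, so only the
// ADD #0 form actually transfers SP. Tiny encoder-side rule of thumb:
fn mov_must_use_add_form(src_is_sp: bool, dst_is_sp: bool) -> bool {
    // copies touching SP need the ADD-immediate alias; all others can use ORR
    src_is_sp || dst_is_sp
}
// -----------------------------------------------------------------------------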
_size: Size, _source: Location, _dest: Location, _flags: bool) { + unimplemented!(); + } + fn location_or(&mut self, _size: Size, _source: Location, _dest: Location, _flags: bool) { + unimplemented!(); + } + fn location_test(&mut self, _size: Size, _source: Location, _dest: Location) { + unimplemented!(); + } + // math + fn location_add(&mut self, size: Size, source: Location, dest: Location, flags: bool) { + let mut temps = vec![]; + let src = self.location_to_reg(size, source, &mut temps, ImmType::Bits12, true, None); + let dst = self.location_to_reg(size, dest, &mut temps, ImmType::None, true, None); + if flags { + self.assembler.emit_adds(size, dst, src, dst); + } else { + self.assembler.emit_add(size, dst, src, dst); + } + if dst != dest { + self.move_location(size, dst, dest); + } + for r in temps { + self.release_gpr(r); + } + } + fn location_sub(&mut self, size: Size, source: Location, dest: Location, flags: bool) { + let mut temps = vec![]; + let src = self.location_to_reg(size, source, &mut temps, ImmType::Bits12, true, None); + let dst = self.location_to_reg(size, dest, &mut temps, ImmType::None, true, None); + if flags { + self.assembler.emit_subs(size, dst, src, dst); + } else { + self.assembler.emit_sub(size, dst, src, dst); + } + if dst != dest { + self.move_location(size, dst, dest); + } + for r in temps { + self.release_gpr(r); + } + } + fn location_cmp(&mut self, size: Size, source: Location, dest: Location) { + self.emit_relaxed_binop(Assembler::emit_cmp, size, source, dest, false); + } + fn jmp_unconditionnal(&mut self, label: Label) { + self.assembler.emit_b_label(label); + } + fn jmp_on_equal(&mut self, label: Label) { + self.assembler.emit_bcond_label(Condition::Eq, label); + } + fn jmp_on_different(&mut self, label: Label) { + self.assembler.emit_bcond_label(Condition::Ne, label); + } + fn jmp_on_above(&mut self, label: Label) { + self.assembler.emit_bcond_label(Condition::Hi, label); + } + fn jmp_on_aboveequal(&mut self, label: Label) { + self.assembler.emit_bcond_label(Condition::Cs, label); + } + fn jmp_on_belowequal(&mut self, label: Label) { + self.assembler.emit_bcond_label(Condition::Ls, label); + } + fn jmp_on_overflow(&mut self, label: Label) { + self.assembler.emit_bcond_label(Condition::Cs, label); + } + + // jmp table + fn emit_jmp_to_jumptable(&mut self, label: Label, cond: Location) { + let tmp1 = self.acquire_temp_gpr().unwrap(); + let tmp2 = self.acquire_temp_gpr().unwrap(); + + self.assembler.emit_load_label(tmp1, label); + self.move_location(Size::S32, cond, Location::GPR(tmp2)); + + self.assembler.emit_add_lsl( + Size::S64, + Location::GPR(tmp1), + Location::GPR(tmp2), + 2, + Location::GPR(tmp2), + ); + self.assembler.emit_b_register(tmp2); + self.release_gpr(tmp2); + self.release_gpr(tmp1); + } + + fn align_for_loop(&mut self) { + // noting to do on ARM64 + } + + fn emit_ret(&mut self) { + self.assembler.emit_ret(); + } + + fn emit_push(&mut self, size: Size, loc: Location) { + self.emit_push(size, loc); + } + fn emit_pop(&mut self, size: Size, loc: Location) { + self.emit_pop(size, loc); + } + + fn emit_memory_fence(&mut self) { + self.assembler.emit_dmb(); + } + + fn location_neg( + &mut self, + _size_val: Size, // size of src + _signed: bool, + _source: Location, + _size_op: Size, + _dest: Location, + ) { + unimplemented!(); + } + + fn emit_imul_imm32(&mut self, size: Size, imm32: u32, gpr: GPR) { + let tmp = self.acquire_temp_gpr().unwrap(); + self.assembler + .emit_mov_imm(Location::GPR(tmp), imm32 as u64); + self.assembler.emit_mul( + size, 
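// --- editorial sketch, not part of the patch ---------------------------------
// emit_jmp_to_jumptable above loads the table label's address, scales the
// selector by 4 with `ADD ..., LSL #2` (one 32-bit instruction per slot,
// presumably a branch to the case body), and then branches to the computed
// slot. The address arithmetic, in host terms:
fn jump_table_slot(table_base: u64, index: u64) -> u64 {
    table_base + (index << 2) // ADD tmp2, tmp1, tmp2, LSL #2, then BR tmp2
}
// -----------------------------------------------------------------------------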
+ Location::GPR(gpr), + Location::GPR(tmp), + Location::GPR(gpr), + ); + self.release_gpr(tmp); + } + + // relaxed binop based... + fn emit_relaxed_mov(&mut self, sz: Size, src: Location, dst: Location) { + self.emit_relaxed_binop(Assembler::emit_mov, sz, src, dst, true); + } + fn emit_relaxed_cmp(&mut self, sz: Size, src: Location, dst: Location) { + self.emit_relaxed_binop(Assembler::emit_cmp, sz, src, dst, false); + } + fn emit_relaxed_zero_extension( + &mut self, + _sz_src: Size, + _src: Location, + _sz_dst: Size, + _dst: Location, + ) { + unimplemented!(); + } + fn emit_relaxed_sign_extension( + &mut self, + sz_src: Size, + src: Location, + sz_dst: Size, + dst: Location, + ) { + match (src, dst) { + (Location::Memory(_, _), Location::GPR(_)) => match sz_src { + Size::S8 => self.emit_relaxed_ldr8s(sz_dst, dst, src), + Size::S16 => self.emit_relaxed_ldr16s(sz_dst, dst, src), + Size::S32 => self.emit_relaxed_ldr32s(sz_dst, dst, src), + _ => unreachable!(), + }, + _ => { + let mut temps = vec![]; + let src = self.location_to_reg(sz_src, src, &mut temps, ImmType::None, true, None); + let dest = + self.location_to_reg(sz_dst, dst, &mut temps, ImmType::None, false, None); + match sz_src { + Size::S8 => self.assembler.emit_sxtb(sz_dst, src, dest), + Size::S16 => self.assembler.emit_sxth(sz_dst, src, dest), + Size::S32 => self.assembler.emit_sxtw(sz_dst, src, dest), + _ => unreachable!(), + }; + if dst != dest { + self.move_location(sz_dst, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + } + } + + fn emit_binop_add32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_add, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::Bits12, + ); + } + fn emit_binop_sub32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_sub, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::Bits12, + ); + } + fn emit_binop_mul32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_mul, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + fn emit_binop_udiv32( + &mut self, + loc_a: Location, + loc_b: Location, + ret: Location, + integer_division_by_zero: Label, + _integer_overflow: Label, + ) -> usize { + let mut temps = vec![]; + let src1 = self.location_to_reg(Size::S32, loc_a, &mut temps, ImmType::None, true, None); + let src2 = self.location_to_reg(Size::S32, loc_b, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + + self.assembler + .emit_cbz_label(Size::S32, src2, integer_division_by_zero); + let offset = self.mark_instruction_with_trap_code(TrapCode::IntegerOverflow); + self.assembler.emit_udiv(Size::S32, src1, src2, dest); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + offset + } + fn emit_binop_sdiv32( + &mut self, + loc_a: Location, + loc_b: Location, + ret: Location, + integer_division_by_zero: Label, + integer_overflow: Label, + ) -> usize { + let mut temps = vec![]; + let src1 = self.location_to_reg(Size::S32, loc_a, &mut temps, ImmType::None, true, None); + let src2 = self.location_to_reg(Size::S32, loc_b, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + + self.assembler + .emit_cbz_label(Size::S32, src2, integer_division_by_zero); + let label_nooverflow = 
self.assembler.get_label(); + let tmp = self.location_to_reg( + Size::S32, + Location::Imm32(0x80000000), + &mut temps, + ImmType::None, + true, + None, + ); + self.assembler.emit_cmp(Size::S32, tmp, src1); + self.assembler + .emit_bcond_label(Condition::Ne, label_nooverflow); + self.assembler.emit_movn(Size::S32, tmp, 0); + self.assembler.emit_cmp(Size::S32, tmp, src2); + self.assembler + .emit_bcond_label(Condition::Eq, integer_overflow); + let offset = self.mark_instruction_with_trap_code(TrapCode::IntegerOverflow); + self.assembler.emit_label(label_nooverflow); + self.assembler.emit_sdiv(Size::S32, src1, src2, dest); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + offset + } + fn emit_binop_urem32( + &mut self, + loc_a: Location, + loc_b: Location, + ret: Location, + integer_division_by_zero: Label, + _integer_overflow: Label, + ) -> usize { + let mut temps = vec![]; + let src1 = self.location_to_reg(Size::S32, loc_a, &mut temps, ImmType::None, true, None); + let src2 = self.location_to_reg(Size::S32, loc_b, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + let dest = if dest == src1 || dest == src2 { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + self.assembler.emit_mov(Size::S32, dest, Location::GPR(tmp)); + Location::GPR(tmp) + } else { + dest + }; + self.assembler + .emit_cbz_label(Size::S32, src2, integer_division_by_zero); + let offset = self.mark_instruction_with_trap_code(TrapCode::IntegerOverflow); + self.assembler.emit_udiv(Size::S32, src1, src2, dest); + // unsigned remainder : src1 - (src1/src2)*src2 + self.assembler.emit_msub(Size::S32, dest, src2, src1, dest); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + offset + } + fn emit_binop_srem32( + &mut self, + loc_a: Location, + loc_b: Location, + ret: Location, + integer_division_by_zero: Label, + _integer_overflow: Label, + ) -> usize { + let mut temps = vec![]; + let src1 = self.location_to_reg(Size::S32, loc_a, &mut temps, ImmType::None, true, None); + let src2 = self.location_to_reg(Size::S32, loc_b, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + let dest = if dest == src1 || dest == src2 { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + self.assembler.emit_mov(Size::S32, dest, Location::GPR(tmp)); + Location::GPR(tmp) + } else { + dest + }; + self.assembler + .emit_cbz_label(Size::S32, src2, integer_division_by_zero); + let offset = self.mark_instruction_with_trap_code(TrapCode::IntegerOverflow); + self.assembler.emit_sdiv(Size::S32, src1, src2, dest); + // unsigned remainder : src1 - (src1/src2)*src2 + self.assembler.emit_msub(Size::S32, dest, src2, src1, dest); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + offset + } + fn emit_binop_and32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_and, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::Logical32, + ); + } + fn emit_binop_or32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_or, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::Logical32, + ); + } + fn emit_binop_xor32(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + 
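// --- editorial sketch, not part of the patch ---------------------------------
// Host-side model of the division helpers above: division by zero is caught
// with CBZ, and the single overflowing signed case (i32::MIN / -1, detected by
// comparing against 0x8000_0000 and the MOVN-produced -1) traps with
// IntegerOverflow. Remainders reuse the quotient via MSUB: rem = a - (a/b)*b.
fn wasm_i32_div_s(a: i32, b: i32) -> Result<i32, &'static str> {
    if b == 0 {
        return Err("integer divide by zero");
    }
    if a == i32::MIN && b == -1 {
        return Err("integer overflow");
    }
    Ok(a.wrapping_div(b))
}

fn wasm_i32_rem_u(a: u32, b: u32) -> Result<u32, &'static str> {
    if b == 0 {
        return Err("integer divide by zero");
    }
    Ok(a - (a / b) * b) // MSUB dest, quotient, divisor, dividend
}
// -----------------------------------------------------------------------------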
self.emit_relaxed_binop3( + Assembler::emit_eor, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::Logical32, + ); + } + fn i32_cmp_ge_s(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Ge, loc_a, loc_b, ret); + } + fn i32_cmp_gt_s(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Gt, loc_a, loc_b, ret); + } + fn i32_cmp_le_s(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Le, loc_a, loc_b, ret); + } + fn i32_cmp_lt_s(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Lt, loc_a, loc_b, ret); + } + fn i32_cmp_ge_u(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Cs, loc_a, loc_b, ret); + } + fn i32_cmp_gt_u(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Hi, loc_a, loc_b, ret); + } + fn i32_cmp_le_u(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Ls, loc_a, loc_b, ret); + } + fn i32_cmp_lt_u(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Cc, loc_a, loc_b, ret); + } + fn i32_cmp_ne(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Ne, loc_a, loc_b, ret); + } + fn i32_cmp_eq(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i32_dynamic_b(Condition::Eq, loc_a, loc_b, ret); + } + fn i32_clz(&mut self, src: Location, dst: Location) { + self.emit_relaxed_binop(Assembler::emit_clz, Size::S32, src, dst, true); + } + fn i32_ctz(&mut self, src: Location, dst: Location) { + let mut temps = vec![]; + let src = self.location_to_reg(Size::S32, src, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S32, dst, &mut temps, ImmType::None, false, None); + self.assembler.emit_rbit(Size::S32, src, dest); + self.assembler.emit_clz(Size::S32, dest, dest); + if dst != dest { + self.move_location(Size::S32, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn i32_popcnt(&mut self, loc: Location, ret: Location) { + // no opcode for that. 
+ // 2 solutions: using NEON CNT, that count bits per Byte, or using clz with some shift and loop + let mut temps = vec![]; + let src = self.location_to_reg(Size::S32, loc, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + let src = if src == loc { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + self.assembler.emit_mov(Size::S32, src, Location::GPR(tmp)); + Location::GPR(tmp) + } else { + src + }; + let tmp = { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + Location::GPR(tmp) + }; + let label_loop = self.assembler.get_label(); + let label_exit = self.assembler.get_label(); + self.assembler + .emit_mov(Size::S32, Location::GPR(GPR::XzrSp), dest); // 0 => dest + self.assembler.emit_cbz_label(Size::S32, src, label_exit); // src==0, exit + self.assembler.emit_label(label_loop); // loop: + self.assembler + .emit_add(Size::S32, dest, Location::Imm8(1), dest); // inc dest + self.assembler.emit_clz(Size::S32, src, tmp); // clz src => tmp + self.assembler + .emit_add(Size::S32, tmp, Location::Imm8(1), tmp); // inc tmp + self.assembler.emit_lsl(Size::S32, src, tmp, src); // src << tmp => src + self.assembler.emit_cbnz_label(Size::S32, src, label_loop); // if src!=0 goto loop + self.assembler.emit_label(label_exit); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn i32_shl(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_lsl, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::Shift32No0, + ); + } + fn i32_shr(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_lsr, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::Shift32No0, + ); + } + fn i32_sar(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_asr, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::Shift32No0, + ); + } + fn i32_rol(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let src2 = match loc_b { + Location::Imm8(imm) => Location::Imm8(32 - (imm & 31)), + Location::Imm32(imm) => Location::Imm8(32 - (imm & 31) as u8), + Location::Imm64(imm) => Location::Imm8(32 - (imm & 31) as u8), + _ => { + let tmp1 = self.location_to_reg( + Size::S32, + Location::Imm32(32), + &mut temps, + ImmType::None, + true, + None, + ); + let tmp2 = + self.location_to_reg(Size::S32, loc_b, &mut temps, ImmType::None, true, None); + self.assembler.emit_sub(Size::S32, tmp1, tmp2, tmp1); + tmp1 + } + }; + self.emit_relaxed_binop3( + Assembler::emit_ror, + Size::S32, + loc_a, + src2, + ret, + ImmType::Shift32No0, + ); + for r in temps { + self.release_gpr(r); + } + } + fn i32_ror(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_ror, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::Shift32No0, + ); + } + fn i32_load( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr32(Size::S32, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i32_load_8u( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: 
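// --- editorial sketch, not part of the patch ---------------------------------
// Host-side models of the two bit tricks above. CTZ has no direct opcode, so
// the code bit-reverses with RBIT and counts leading zeros. Scalar POPCNT has
// no opcode either; the emitted loop repeatedly strips the leading set bit
// (CLZ, then shift left by clz + 1) and counts the iterations.
fn ctz32(x: u32) -> u32 {
    x.reverse_bits().leading_zeros() // RBIT + CLZ; ctz32(0) == 32 as wasm requires
}

fn popcnt32(mut x: u32) -> u32 {
    let mut count = 0;
    while x != 0 {
        count += 1;                        // one set bit accounted for
        let shift = x.leading_zeros() + 1; // CLZ + 1
        // a shift of 32 means every remaining bit has been consumed
        x = x.checked_shl(shift).unwrap_or(0);
    }
    count
}
// -----------------------------------------------------------------------------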
bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr8(Size::S32, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i32_load_8s( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr8s(Size::S32, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i32_load_16u( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr16(Size::S32, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i32_load_16s( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr16s(Size::S32, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i32_atomic_load( + &mut self, + _addr: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i32_atomic_load_8u( + &mut self, + _addr: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i32_atomic_load_16u( + &mut self, + _addr: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i32_save( + &mut self, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + target_addr, + memarg, + false, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_str32(target_value, Location::Memory(addr, 0)); + }, + ); + } + fn i32_save_8( + &mut self, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + target_addr, + memarg, + false, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_str8(target_value, Location::Memory(addr, 0)); + }, + ); + } + fn i32_save_16( + &mut self, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + target_addr, + memarg, + false, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_str16(target_value, Location::Memory(addr, 0)); + }, + ); + } + fn i32_atomic_save( + &mut self, + _value: Location, + _memarg: &MemoryImmediate, + _target_addr: 
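// --- editorial sketch, not part of the patch ---------------------------------
// Every load/store above funnels through memory_op with the access width and a
// heap_access_oob label. Assuming memory_op performs the usual linear-memory
// bounds check before handing a guest pointer to the closure, the logic is
// roughly the following (names hypothetical):
fn checked_linear_access(
    addr: u32,          // wasm address operand
    memarg_offset: u32, // static offset from the MemoryImmediate
    access_bytes: u64,
    memory_len: u64,
) -> Result<u64, &'static str> {
    let effective = addr as u64 + memarg_offset as u64; // cannot overflow in u64
    if effective + access_bytes > memory_len {
        return Err("heap access out of bounds"); // jump to heap_access_oob
    }
    Ok(effective) // the closure then emits the access at Location::Memory(addr, 0)
}
// -----------------------------------------------------------------------------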
Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i32_atomic_save_8( + &mut self, + _value: Location, + _memarg: &MemoryImmediate, + _target_addr: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i32_atomic_save_16( + &mut self, + _value: Location, + _memarg: &MemoryImmediate, + _target_addr: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Add with i32 + fn i32_atomic_add( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Add with u8 + fn i32_atomic_add_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Add with u16 + fn i32_atomic_add_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Sub with i32 + fn i32_atomic_sub( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Sub with u8 + fn i32_atomic_sub_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Sub with u16 + fn i32_atomic_sub_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic And with i32 + fn i32_atomic_and( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic And with u8 + fn i32_atomic_and_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic And with u16 + fn i32_atomic_and_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Or with i32 + fn i32_atomic_or( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Or with u8 + fn i32_atomic_or_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + 
_heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Or with u16 + fn i32_atomic_or_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Xor with i32 + fn i32_atomic_xor( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Xor with u8 + fn i32_atomic_xor_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Xor with u16 + fn i32_atomic_xor_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Exchange with i32 + fn i32_atomic_xchg( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Exchange with u8 + fn i32_atomic_xchg_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Exchange with u16 + fn i32_atomic_xchg_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Exchange with i32 + fn i32_atomic_cmpxchg( + &mut self, + _new: Location, + _cmp: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Exchange with u8 + fn i32_atomic_cmpxchg_8u( + &mut self, + _new: Location, + _cmp: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i32 atomic Exchange with u16 + fn i32_atomic_cmpxchg_16u( + &mut self, + _new: Location, + _cmp: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + + fn move_with_reloc( + &mut self, + reloc_target: RelocationTarget, + relocations: &mut Vec, + ) { + let reloc_at = self.assembler.get_offset().0; + relocations.push(Relocation { + kind: RelocationKind::Arm64Movw0, + reloc_target, + offset: reloc_at as u32, + addend: 0, + }); + self.assembler.emit_movz(Location::GPR(GPR::X27), 0); + let reloc_at = self.assembler.get_offset().0; + relocations.push(Relocation { + kind: RelocationKind::Arm64Movw1, + reloc_target, + offset: reloc_at as u32, + addend: 0, + }); + self.assembler.emit_movk(Location::GPR(GPR::X27), 0, 16); + let reloc_at = self.assembler.get_offset().0; + relocations.push(Relocation { + kind: 
RelocationKind::Arm64Movw2, + reloc_target, + offset: reloc_at as u32, + addend: 0, + }); + self.assembler.emit_movk(Location::GPR(GPR::X27), 0, 32); + let reloc_at = self.assembler.get_offset().0; + relocations.push(Relocation { + kind: RelocationKind::Arm64Movw3, + reloc_target, + offset: reloc_at as u32, + addend: 0, + }); + self.assembler.emit_movk(Location::GPR(GPR::X27), 0, 48); + } + + fn emit_binop_add64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_add, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::Bits12, + ); + } + fn emit_binop_sub64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_sub, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::Bits12, + ); + } + fn emit_binop_mul64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_mul, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + fn emit_binop_udiv64( + &mut self, + loc_a: Location, + loc_b: Location, + ret: Location, + integer_division_by_zero: Label, + _integer_overflow: Label, + ) -> usize { + let mut temps = vec![]; + let src1 = self.location_to_reg(Size::S64, loc_a, &mut temps, ImmType::None, true, None); + let src2 = self.location_to_reg(Size::S64, loc_b, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + + self.assembler + .emit_cbz_label(Size::S64, src2, integer_division_by_zero); + let offset = self.mark_instruction_with_trap_code(TrapCode::IntegerOverflow); + self.assembler.emit_udiv(Size::S64, src1, src2, dest); + if ret != dest { + self.move_location(Size::S64, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + offset + } + fn emit_binop_sdiv64( + &mut self, + loc_a: Location, + loc_b: Location, + ret: Location, + integer_division_by_zero: Label, + integer_overflow: Label, + ) -> usize { + let mut temps = vec![]; + let src1 = self.location_to_reg(Size::S64, loc_a, &mut temps, ImmType::None, true, None); + let src2 = self.location_to_reg(Size::S64, loc_b, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + + self.assembler + .emit_cbz_label(Size::S64, src2, integer_division_by_zero); + let label_nooverflow = self.assembler.get_label(); + let tmp = self.location_to_reg( + Size::S64, + Location::Imm64(0x8000000000000000), + &mut temps, + ImmType::None, + true, + None, + ); + self.assembler.emit_cmp(Size::S64, tmp, src1); + self.assembler + .emit_bcond_label(Condition::Ne, label_nooverflow); + self.assembler.emit_movn(Size::S64, tmp, 0); + self.assembler.emit_cmp(Size::S64, tmp, src2); + self.assembler + .emit_bcond_label(Condition::Eq, integer_overflow); + let offset = self.mark_instruction_with_trap_code(TrapCode::IntegerOverflow); + self.assembler.emit_label(label_nooverflow); + self.assembler.emit_sdiv(Size::S64, src1, src2, dest); + if ret != dest { + self.move_location(Size::S64, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + offset + } + fn emit_binop_urem64( + &mut self, + loc_a: Location, + loc_b: Location, + ret: Location, + integer_division_by_zero: Label, + _integer_overflow: Label, + ) -> usize { + let mut temps = vec![]; + let src1 = self.location_to_reg(Size::S64, loc_a, &mut temps, ImmType::None, true, None); + let src2 = self.location_to_reg(Size::S64, loc_b, &mut temps, ImmType::None, true, None); + let dest = 
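// --- editorial sketch, not part of the patch ---------------------------------
// move_with_reloc above materializes a 64-bit address into X27 as one MOVZ and
// three MOVKs, one 16-bit chunk per instruction, recording an Arm64Movw0..3
// relocation at each so the chunks can be patched once the final address is
// known. Host-side model of the split and reassembly:
fn movw_chunks(addr: u64) -> [u16; 4] {
    [
        addr as u16,         // MOVZ x27, #c0
        (addr >> 16) as u16, // MOVK x27, #c1, LSL #16
        (addr >> 32) as u16, // MOVK x27, #c2, LSL #32
        (addr >> 48) as u16, // MOVK x27, #c3, LSL #48
    ]
}

fn reassemble(c: [u16; 4]) -> u64 {
    (c[0] as u64) | (c[1] as u64) << 16 | (c[2] as u64) << 32 | (c[3] as u64) << 48
}
// -----------------------------------------------------------------------------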
self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + let dest = if dest == src1 || dest == src2 { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + self.assembler.emit_mov(Size::S32, dest, Location::GPR(tmp)); + Location::GPR(tmp) + } else { + dest + }; + self.assembler + .emit_cbz_label(Size::S64, src2, integer_division_by_zero); + let offset = self.mark_instruction_with_trap_code(TrapCode::IntegerOverflow); + self.assembler.emit_udiv(Size::S64, src1, src2, dest); + // unsigned remainder : src1 - (src1/src2)*src2 + self.assembler.emit_msub(Size::S64, dest, src2, src1, dest); + if ret != dest { + self.move_location(Size::S64, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + offset + } + fn emit_binop_srem64( + &mut self, + loc_a: Location, + loc_b: Location, + ret: Location, + integer_division_by_zero: Label, + _integer_overflow: Label, + ) -> usize { + let mut temps = vec![]; + let src1 = self.location_to_reg(Size::S64, loc_a, &mut temps, ImmType::None, true, None); + let src2 = self.location_to_reg(Size::S64, loc_b, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + let dest = if dest == src1 || dest == src2 { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + self.assembler.emit_mov(Size::S64, dest, Location::GPR(tmp)); + Location::GPR(tmp) + } else { + dest + }; + self.assembler + .emit_cbz_label(Size::S64, src2, integer_division_by_zero); + let offset = self.mark_instruction_with_trap_code(TrapCode::IntegerOverflow); + self.assembler.emit_sdiv(Size::S64, src1, src2, dest); + // unsigned remainder : src1 - (src1/src2)*src2 + self.assembler.emit_msub(Size::S64, dest, src2, src1, dest); + if ret != dest { + self.move_location(Size::S64, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + offset + } + fn emit_binop_and64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_and, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::Logical64, + ); + } + fn emit_binop_or64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_or, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::Logical64, + ); + } + fn emit_binop_xor64(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_eor, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::Logical64, + ); + } + fn i64_cmp_ge_s(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i64_dynamic_b(Condition::Ge, loc_a, loc_b, ret); + } + fn i64_cmp_gt_s(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i64_dynamic_b(Condition::Gt, loc_a, loc_b, ret); + } + fn i64_cmp_le_s(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i64_dynamic_b(Condition::Le, loc_a, loc_b, ret); + } + fn i64_cmp_lt_s(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i64_dynamic_b(Condition::Lt, loc_a, loc_b, ret); + } + fn i64_cmp_ge_u(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i64_dynamic_b(Condition::Cs, loc_a, loc_b, ret); + } + fn i64_cmp_gt_u(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i64_dynamic_b(Condition::Hi, loc_a, loc_b, ret); + } + fn i64_cmp_le_u(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + 
self.emit_cmpop_i64_dynamic_b(Condition::Ls, loc_a, loc_b, ret); + } + fn i64_cmp_lt_u(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i64_dynamic_b(Condition::Cc, loc_a, loc_b, ret); + } + fn i64_cmp_ne(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i64_dynamic_b(Condition::Ne, loc_a, loc_b, ret); + } + fn i64_cmp_eq(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_cmpop_i64_dynamic_b(Condition::Eq, loc_a, loc_b, ret); + } + fn i64_clz(&mut self, src: Location, dst: Location) { + self.emit_relaxed_binop(Assembler::emit_clz, Size::S64, src, dst, true); + } + fn i64_ctz(&mut self, src: Location, dst: Location) { + let mut temps = vec![]; + let src = self.location_to_reg(Size::S64, src, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S64, dst, &mut temps, ImmType::None, false, None); + self.assembler.emit_rbit(Size::S64, src, dest); + self.assembler.emit_clz(Size::S64, dest, dest); + if dst != dest { + self.move_location(Size::S64, dest, dst); + } + for r in temps { + self.release_gpr(r); + } + } + fn i64_popcnt(&mut self, loc: Location, ret: Location) { + let mut temps = vec![]; + let src = self.location_to_reg(Size::S64, loc, &mut temps, ImmType::None, true, None); + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + let src = if src == loc { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + self.assembler.emit_mov(Size::S64, src, Location::GPR(tmp)); + Location::GPR(tmp) + } else { + src + }; + let tmp = { + let tmp = self.acquire_temp_gpr().unwrap(); + temps.push(tmp.clone()); + Location::GPR(tmp) + }; + let label_loop = self.assembler.get_label(); + let label_exit = self.assembler.get_label(); + self.assembler + .emit_mov(Size::S32, Location::GPR(GPR::XzrSp), dest); + self.assembler.emit_cbz_label(Size::S64, src, label_exit); + self.assembler.emit_label(label_loop); + self.assembler + .emit_add(Size::S32, dest, Location::Imm8(1), dest); + self.assembler.emit_clz(Size::S64, src, tmp); + self.assembler + .emit_add(Size::S32, tmp, Location::Imm8(1), tmp); + self.assembler.emit_lsl(Size::S64, src, tmp, src); + self.assembler.emit_cbnz_label(Size::S64, src, label_loop); + self.assembler.emit_label(label_exit); + if ret != dest { + self.move_location(Size::S64, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn i64_shl(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_lsl, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::Shift64No0, + ); + } + fn i64_shr(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_lsr, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::Shift64No0, + ); + } + fn i64_sar(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_asr, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::Shift64No0, + ); + } + fn i64_rol(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + // there is no ROL on ARM64. 
We use ROR with 64-value instead + let mut temps = vec![]; + let src2 = match loc_b { + Location::Imm8(imm) => Location::Imm8(64 - (imm & 63)), + Location::Imm32(imm) => Location::Imm8(64 - (imm & 63) as u8), + Location::Imm64(imm) => Location::Imm8(64 - (imm & 63) as u8), + _ => { + let tmp1 = self.location_to_reg( + Size::S64, + Location::Imm32(64), + &mut temps, + ImmType::None, + true, + None, + ); + let tmp2 = + self.location_to_reg(Size::S64, loc_b, &mut temps, ImmType::None, true, None); + self.assembler.emit_sub(Size::S64, tmp1, tmp2, tmp1); + tmp1 + } + }; + self.emit_relaxed_binop3( + Assembler::emit_ror, + Size::S64, + loc_a, + src2, + ret, + ImmType::Shift64No0, + ); + for r in temps { + self.release_gpr(r); + } + } + fn i64_ror(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3( + Assembler::emit_ror, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::Shift64No0, + ); + } + fn i64_load( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr64(Size::S64, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i64_load_8u( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr8(Size::S64, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i64_load_8s( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr8s(Size::S64, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i64_load_16u( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr16(Size::S64, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i64_load_16s( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr16s(Size::S64, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i64_load_32u( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr32(Size::S64, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i64_load_32s( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + 
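// --- editorial sketch, not part of the patch ---------------------------------
// As the comment above notes, AArch64 has no ROL, so rotate-left is rewritten
// as rotate-right by (width - amount) for both the immediate and the register
// paths. Host-side equivalent for the 64-bit case:
fn rol64(x: u64, n: u32) -> u64 {
    x.rotate_right((64 - (n & 63)) & 63)
}
// e.g. rol64(0x8000_0000_0000_0001, 1) == 3
// -----------------------------------------------------------------------------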
self.memory_op( + addr, + memarg, + false, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_ldr32s(Size::S64, ret, Location::Memory(addr, 0)); + }, + ); + } + fn i64_atomic_load( + &mut self, + _addr: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i64_atomic_load_8u( + &mut self, + _addr: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i64_atomic_load_16u( + &mut self, + _addr: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i64_atomic_load_32u( + &mut self, + _addr: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i64_save( + &mut self, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + target_addr, + memarg, + false, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_str64(target_value, Location::Memory(addr, 0)); + }, + ); + } + fn i64_save_8( + &mut self, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + target_addr, + memarg, + false, + 1, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_str8(target_value, Location::Memory(addr, 0)); + }, + ); + } + fn i64_save_16( + &mut self, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + target_addr, + memarg, + false, + 2, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_str16(target_value, Location::Memory(addr, 0)); + }, + ); + } + fn i64_save_32( + &mut self, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + target_addr, + memarg, + false, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.emit_relaxed_str32(target_value, Location::Memory(addr, 0)); + }, + ); + } + fn i64_atomic_save( + &mut self, + _value: Location, + _memarg: &MemoryImmediate, + _target_addr: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i64_atomic_save_8( + &mut self, + _value: Location, + _memarg: &MemoryImmediate, + _target_addr: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i64_atomic_save_16( + &mut self, + _value: Location, + _memarg: &MemoryImmediate, + _target_addr: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + fn i64_atomic_save_32( + &mut self, + 
_value: Location, + _memarg: &MemoryImmediate, + _target_addr: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Add with i64 + fn i64_atomic_add( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Add with u8 + fn i64_atomic_add_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Add with u16 + fn i64_atomic_add_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Add with u32 + fn i64_atomic_add_32u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Sub with i64 + fn i64_atomic_sub( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Sub with u8 + fn i64_atomic_sub_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Sub with u16 + fn i64_atomic_sub_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Sub with u32 + fn i64_atomic_sub_32u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic And with i64 + fn i64_atomic_and( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic And with u8 + fn i64_atomic_and_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic And with u16 + fn i64_atomic_and_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic And with u32 + fn i64_atomic_and_32u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Or with i64 + fn i64_atomic_or( + &mut self, + _loc: 
Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Or with u8 + fn i64_atomic_or_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Or with u16 + fn i64_atomic_or_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Or with u32 + fn i64_atomic_or_32u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic xor with i64 + fn i64_atomic_xor( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic xor with u8 + fn i64_atomic_xor_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic xor with u16 + fn i64_atomic_xor_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic xor with u32 + fn i64_atomic_xor_32u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Exchange with i64 + fn i64_atomic_xchg( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Exchange with u8 + fn i64_atomic_xchg_8u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Exchange with u16 + fn i64_atomic_xchg_16u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Exchange with u32 + fn i64_atomic_xchg_32u( + &mut self, + _loc: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Exchange with i64 + fn i64_atomic_cmpxchg( + &mut self, + _new: Location, + _cmp: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Exchange 
with u8 + fn i64_atomic_cmpxchg_8u( + &mut self, + _new: Location, + _cmp: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Exchange with u16 + fn i64_atomic_cmpxchg_16u( + &mut self, + _new: Location, + _cmp: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + // i64 atomic Exchange with u32 + fn i64_atomic_cmpxchg_32u( + &mut self, + _new: Location, + _cmp: Location, + _target: Location, + _memarg: &MemoryImmediate, + _ret: Location, + _need_check: bool, + _imported_memories: bool, + _offset: i32, + _heap_access_oob: Label, + ) { + unimplemented!(); + } + + fn f32_load( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.assembler + .emit_ldr(Size::S32, ret, Location::Memory(addr, 0)); + }, + ); + } + fn f32_save( + &mut self, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + canonicalize: bool, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + let canonicalize = canonicalize && self.arch_supports_canonicalize_nan(); + self.memory_op( + target_addr, + memarg, + false, + 4, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + if !canonicalize { + this.emit_relaxed_str32(target_value, Location::Memory(addr, 0)); + } else { + this.canonicalize_nan(Size::S32, target_value, Location::Memory(addr, 0)); + } + }, + ); + } + fn f64_load( + &mut self, + addr: Location, + memarg: &MemoryImmediate, + ret: Location, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + self.memory_op( + addr, + memarg, + false, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + this.assembler + .emit_ldr(Size::S64, ret, Location::Memory(addr, 0)); + }, + ); + } + fn f64_save( + &mut self, + target_value: Location, + memarg: &MemoryImmediate, + target_addr: Location, + canonicalize: bool, + need_check: bool, + imported_memories: bool, + offset: i32, + heap_access_oob: Label, + ) { + let canonicalize = canonicalize && self.arch_supports_canonicalize_nan(); + self.memory_op( + target_addr, + memarg, + false, + 8, + need_check, + imported_memories, + offset, + heap_access_oob, + |this, addr| { + if !canonicalize { + this.emit_relaxed_str64(target_value, Location::Memory(addr, 0)); + } else { + this.canonicalize_nan(Size::S64, target_value, Location::Memory(addr, 0)); + } + }, + ); + } + + fn convert_f64_i64(&mut self, loc: Location, signed: bool, ret: Location) { + let mut gprs = vec![]; + let mut neons = vec![]; + let src = self.location_to_reg(Size::S64, loc, &mut gprs, ImmType::NoneXzr, true, None); + let dest = self.location_to_neon(Size::S64, ret, &mut neons, ImmType::None, false); + if signed { + self.assembler.emit_scvtf(Size::S64, src, Size::S64, dest); + } else { + self.assembler.emit_ucvtf(Size::S64, src, Size::S64, dest); + } + if ret != dest { + self.move_location(Size::S64, dest, ret); + } + for r in gprs { + self.release_gpr(r); + } + for r in neons { + 
self.release_simd(r); + } + } + fn convert_f64_i32(&mut self, loc: Location, signed: bool, ret: Location) { + let mut gprs = vec![]; + let mut neons = vec![]; + let src = self.location_to_reg(Size::S32, loc, &mut gprs, ImmType::NoneXzr, true, None); + let dest = self.location_to_neon(Size::S64, ret, &mut neons, ImmType::None, false); + if signed { + self.assembler.emit_scvtf(Size::S32, src, Size::S64, dest); + } else { + self.assembler.emit_ucvtf(Size::S32, src, Size::S64, dest); + } + if ret != dest { + self.move_location(Size::S64, dest, ret); + } + for r in gprs { + self.release_gpr(r); + } + for r in neons { + self.release_simd(r); + } + } + fn convert_f32_i64(&mut self, loc: Location, signed: bool, ret: Location) { + let mut gprs = vec![]; + let mut neons = vec![]; + let src = self.location_to_reg(Size::S64, loc, &mut gprs, ImmType::NoneXzr, true, None); + let dest = self.location_to_neon(Size::S32, ret, &mut neons, ImmType::None, false); + if signed { + self.assembler.emit_scvtf(Size::S64, src, Size::S32, dest); + } else { + self.assembler.emit_ucvtf(Size::S64, src, Size::S32, dest); + } + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in gprs { + self.release_gpr(r); + } + for r in neons { + self.release_simd(r); + } + } + fn convert_f32_i32(&mut self, loc: Location, signed: bool, ret: Location) { + let mut gprs = vec![]; + let mut neons = vec![]; + let src = self.location_to_reg(Size::S32, loc, &mut gprs, ImmType::NoneXzr, true, None); + let dest = self.location_to_neon(Size::S32, ret, &mut neons, ImmType::None, false); + if signed { + self.assembler.emit_scvtf(Size::S32, src, Size::S32, dest); + } else { + self.assembler.emit_ucvtf(Size::S32, src, Size::S32, dest); + } + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in gprs { + self.release_gpr(r); + } + for r in neons { + self.release_simd(r); + } + } + fn convert_i64_f64(&mut self, loc: Location, ret: Location, signed: bool, sat: bool) { + let mut gprs = vec![]; + let mut neons = vec![]; + let src = self.location_to_neon(Size::S64, loc, &mut neons, ImmType::None, true); + let dest = self.location_to_reg(Size::S64, ret, &mut gprs, ImmType::None, false, None); + let old_fpcr = if !sat { + self.reset_exception_fpsr(); + self.set_trap_enabled(&mut gprs) + } else { + GPR::XzrSp + }; + if signed { + self.assembler.emit_fcvtzs(Size::S64, src, Size::S64, dest); + } else { + self.assembler.emit_fcvtzu(Size::S64, src, Size::S64, dest); + } + if !sat { + self.trap_float_convertion_errors(old_fpcr, Size::S64, src, &mut gprs); + } + if ret != dest { + self.move_location(Size::S64, dest, ret); + } + for r in gprs { + self.release_gpr(r); + } + for r in neons { + self.release_simd(r); + } + } + fn convert_i32_f64(&mut self, loc: Location, ret: Location, signed: bool, sat: bool) { + let mut gprs = vec![]; + let mut neons = vec![]; + let src = self.location_to_neon(Size::S64, loc, &mut neons, ImmType::None, true); + let dest = self.location_to_reg(Size::S32, ret, &mut gprs, ImmType::None, false, None); + let old_fpcr = if !sat { + self.reset_exception_fpsr(); + self.set_trap_enabled(&mut gprs) + } else { + GPR::XzrSp + }; + if signed { + self.assembler.emit_fcvtzs(Size::S64, src, Size::S32, dest); + } else { + self.assembler.emit_fcvtzu(Size::S64, src, Size::S32, dest); + } + if !sat { + self.trap_float_convertion_errors(old_fpcr, Size::S64, src, &mut gprs); + } + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in gprs { + self.release_gpr(r); + } + for r in neons { + 
self.release_simd(r); + } + } + fn convert_i64_f32(&mut self, loc: Location, ret: Location, signed: bool, sat: bool) { + let mut gprs = vec![]; + let mut neons = vec![]; + let src = self.location_to_neon(Size::S32, loc, &mut neons, ImmType::None, true); + let dest = self.location_to_reg(Size::S64, ret, &mut gprs, ImmType::None, false, None); + let old_fpcr = if !sat { + self.reset_exception_fpsr(); + self.set_trap_enabled(&mut gprs) + } else { + GPR::XzrSp + }; + if signed { + self.assembler.emit_fcvtzs(Size::S32, src, Size::S64, dest); + } else { + self.assembler.emit_fcvtzu(Size::S32, src, Size::S64, dest); + } + if !sat { + self.trap_float_convertion_errors(old_fpcr, Size::S32, src, &mut gprs); + } + if ret != dest { + self.move_location(Size::S64, dest, ret); + } + for r in gprs { + self.release_gpr(r); + } + for r in neons { + self.release_simd(r); + } + } + fn convert_i32_f32(&mut self, loc: Location, ret: Location, signed: bool, sat: bool) { + let mut gprs = vec![]; + let mut neons = vec![]; + let src = self.location_to_neon(Size::S32, loc, &mut neons, ImmType::None, true); + let dest = self.location_to_reg(Size::S32, ret, &mut gprs, ImmType::None, false, None); + let old_fpcr = if !sat { + self.reset_exception_fpsr(); + self.set_trap_enabled(&mut gprs) + } else { + GPR::XzrSp + }; + if signed { + self.assembler.emit_fcvtzs(Size::S32, src, Size::S32, dest); + } else { + self.assembler.emit_fcvtzu(Size::S32, src, Size::S32, dest); + } + if !sat { + self.trap_float_convertion_errors(old_fpcr, Size::S32, src, &mut gprs); + } + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in gprs { + self.release_gpr(r); + } + for r in neons { + self.release_simd(r); + } + } + fn convert_f64_f32(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_fcvt, Size::S32, loc, ret, true); + } + fn convert_f32_f64(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_fcvt, Size::S64, loc, ret, true); + } + fn f64_neg(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_fneg, Size::S64, loc, ret, true); + } + fn f64_abs(&mut self, loc: Location, ret: Location) { + let tmp = self.acquire_temp_gpr().unwrap(); + + self.move_location(Size::S64, loc, Location::GPR(tmp)); + self.assembler.emit_and( + Size::S64, + Location::GPR(tmp), + Location::Imm64(0x7fffffffffffffffu64), + Location::GPR(tmp), + ); + self.move_location(Size::S64, Location::GPR(tmp), ret); + + self.release_gpr(tmp); + } + fn emit_i64_copysign(&mut self, tmp1: GPR, tmp2: GPR) { + self.assembler.emit_and( + Size::S64, + Location::GPR(tmp1), + Location::Imm64(0x7fffffffffffffffu64), + Location::GPR(tmp1), + ); + + self.assembler.emit_and( + Size::S64, + Location::GPR(tmp2), + Location::Imm64(0x8000000000000000u64), + Location::GPR(tmp2), + ); + + self.assembler.emit_or( + Size::S64, + Location::GPR(tmp1), + Location::GPR(tmp2), + Location::GPR(tmp1), + ); + } + fn f64_sqrt(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_fsqrt, Size::S64, loc, ret, true); + } + fn f64_trunc(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_frintz, Size::S64, loc, ret, true); + } + fn f64_ceil(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_frintp, Size::S64, loc, ret, true); + } + fn f64_floor(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_frintm, Size::S64, loc, ret, 
true); + } + fn f64_nearest(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_frintn, Size::S64, loc, ret, true); + } + fn f64_cmp_ge(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S64, loc_b, loc_a, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Ls); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f64_cmp_gt(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S64, loc_b, loc_a, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Cc); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f64_cmp_le(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S64, loc_a, loc_b, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Ls); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f64_cmp_lt(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S64, loc_a, loc_b, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Cc); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f64_cmp_ne(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S64, loc_a, loc_b, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Ne); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f64_cmp_eq(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S64, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S64, loc_a, loc_b, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Eq); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f64_min(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let old_fpcr = self.set_default_nan(&mut temps); + self.emit_relaxed_binop3_neon( + Assembler::emit_fmin, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::None, + ); + self.restore_fpcr(old_fpcr); + for r in temps { + self.release_gpr(r); + } + } + fn f64_max(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let old_fpcr = self.set_default_nan(&mut temps); + self.emit_relaxed_binop3_neon( + Assembler::emit_fmax, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::None, + ); + 
self.restore_fpcr(old_fpcr); + for r in temps { + self.release_gpr(r); + } + } + fn f64_add(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3_neon( + Assembler::emit_fadd, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + fn f64_sub(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3_neon( + Assembler::emit_fsub, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + fn f64_mul(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3_neon( + Assembler::emit_fmul, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + fn f64_div(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3_neon( + Assembler::emit_fdiv, + Size::S64, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + fn f32_neg(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_fneg, Size::S32, loc, ret, true); + } + fn f32_abs(&mut self, loc: Location, ret: Location) { + let tmp = self.acquire_temp_gpr().unwrap(); + self.move_location(Size::S32, loc, Location::GPR(tmp)); + self.assembler.emit_and( + Size::S32, + Location::GPR(tmp), + Location::Imm32(0x7fffffffu32), + Location::GPR(tmp), + ); + self.move_location(Size::S32, Location::GPR(tmp), ret); + self.release_gpr(tmp); + } + fn emit_i32_copysign(&mut self, tmp1: GPR, tmp2: GPR) { + self.assembler.emit_and( + Size::S32, + Location::GPR(tmp1), + Location::Imm32(0x7fffffffu32), + Location::GPR(tmp1), + ); + self.assembler.emit_and( + Size::S32, + Location::GPR(tmp2), + Location::Imm32(0x80000000u32), + Location::GPR(tmp2), + ); + self.assembler.emit_or( + Size::S32, + Location::GPR(tmp1), + Location::GPR(tmp2), + Location::GPR(tmp1), + ); + } + fn f32_sqrt(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_fsqrt, Size::S32, loc, ret, true); + } + fn f32_trunc(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_frintz, Size::S32, loc, ret, true); + } + fn f32_ceil(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_frintp, Size::S32, loc, ret, true); + } + fn f32_floor(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_frintm, Size::S32, loc, ret, true); + } + fn f32_nearest(&mut self, loc: Location, ret: Location) { + self.emit_relaxed_binop_neon(Assembler::emit_frintn, Size::S32, loc, ret, true); + } + fn f32_cmp_ge(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S32, loc_b, loc_a, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Ls); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f32_cmp_gt(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S32, loc_b, loc_a, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Cc); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f32_cmp_le(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest 
= self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S32, loc_a, loc_b, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Ls); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f32_cmp_lt(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S32, loc_a, loc_b, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Cc); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f32_cmp_ne(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S32, loc_a, loc_b, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Ne); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f32_cmp_eq(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let dest = self.location_to_reg(Size::S32, ret, &mut temps, ImmType::None, false, None); + self.emit_relaxed_binop_neon(Assembler::emit_fcmp, Size::S32, loc_a, loc_b, false); + self.assembler.emit_cset(Size::S32, dest, Condition::Eq); + if ret != dest { + self.move_location(Size::S32, dest, ret); + } + for r in temps { + self.release_gpr(r); + } + } + fn f32_min(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let old_fpcr = self.set_default_nan(&mut temps); + self.emit_relaxed_binop3_neon( + Assembler::emit_fmin, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::None, + ); + self.restore_fpcr(old_fpcr); + for r in temps { + self.release_gpr(r); + } + } + fn f32_max(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + let mut temps = vec![]; + let old_fpcr = self.set_default_nan(&mut temps); + self.emit_relaxed_binop3_neon( + Assembler::emit_fmax, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::None, + ); + self.restore_fpcr(old_fpcr); + for r in temps { + self.release_gpr(r); + } + } + fn f32_add(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3_neon( + Assembler::emit_fadd, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + fn f32_sub(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3_neon( + Assembler::emit_fsub, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + fn f32_mul(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3_neon( + Assembler::emit_fmul, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + fn f32_div(&mut self, loc_a: Location, loc_b: Location, ret: Location) { + self.emit_relaxed_binop3_neon( + Assembler::emit_fdiv, + Size::S32, + loc_a, + loc_b, + ret, + ImmType::None, + ); + } + + fn gen_std_trampoline( + &self, + sig: &FunctionType, + calling_convention: CallingConvention, + ) -> FunctionBody { + gen_std_trampoline_arm64(sig, calling_convention) + } + // Generates dynamic import function call trampoline for a function type. 
+ fn gen_std_dynamic_import_trampoline( + &self, + vmoffsets: &VMOffsets, + sig: &FunctionType, + calling_convention: CallingConvention, + ) -> FunctionBody { + gen_std_dynamic_import_trampoline_arm64(vmoffsets, sig, calling_convention) + } + // Singlepass calls import functions through a trampoline. + fn gen_import_call_trampoline( + &self, + vmoffsets: &VMOffsets, + index: FunctionIndex, + sig: &FunctionType, + calling_convention: CallingConvention, + ) -> CustomSection { + gen_import_call_trampoline_arm64(vmoffsets, index, sig, calling_convention) + } +} diff --git a/lib/compiler-singlepass/src/machine_x64.rs b/lib/compiler-singlepass/src/machine_x64.rs index e2e13a4c7d3..2d1c0f5f50d 100644 --- a/lib/compiler-singlepass/src/machine_x64.rs +++ b/lib/compiler-singlepass/src/machine_x64.rs @@ -1,8 +1,7 @@ use crate::common_decl::*; use crate::emitter_x64::*; use crate::location::Location as AbstractLocation; -use crate::machine::Machine; -use crate::machine::{MemoryImmediate, TrapTable}; +use crate::machine::*; use crate::x64_decl::new_machine_state; use crate::x64_decl::{ArgumentRegisterAllocator, X64Register, GPR, XMM}; use dynasmrt::{x64::X64Relocation, VecAssembler}; @@ -1627,11 +1626,12 @@ impl Machine for MachineX86_64 { self.used_gprs.insert(gpr); } - fn push_used_gpr(&mut self) { + fn push_used_gpr(&mut self) -> usize { let used_gprs = self.get_used_gprs(); for r in used_gprs.iter() { self.assembler.emit_push(Size::S64, Location::GPR(*r)); } + used_gprs.len() * 8 } fn pop_used_gpr(&mut self) { let used_gprs = self.get_used_gprs(); @@ -1682,7 +1682,7 @@ impl Machine for MachineX86_64 { assert_eq!(self.used_simd.remove(&simd), true); } - fn push_used_simd(&mut self) { + fn push_used_simd(&mut self) -> usize { let used_xmms = self.get_used_simd(); self.adjust_stack((used_xmms.len() * 8) as u32); @@ -1693,6 +1693,8 @@ impl Machine for MachineX86_64 { Location::Memory(GPR::RSP, (i * 8) as i32), ); } + + used_xmms.len() * 8 } fn pop_used_simd(&mut self) { let used_xmms = self.get_used_simd(); @@ -1775,6 +1777,11 @@ impl Machine for MachineX86_64 { Location::Memory(GPR::RBP, -stack_offset) } + // Return a rounded stack adjustment value (it must be a multiple of 16 bytes on ARM64, for example) + fn round_stack_adjust(&self, value: usize) -> usize { + value + } + // Adjust stack for locals fn adjust_stack(&mut self, delta_stack_offset: u32) { self.assembler.emit_sub( @@ -1791,11 +1798,6 @@ impl Machine for MachineX86_64 { Location::GPR(GPR::RSP), ); } - fn push_callee_saved(&mut self) {} - fn pop_callee_saved(&mut self) { - self.assembler.emit_pop(Size::S64, Location::GPR(GPR::R14)); - self.assembler.emit_pop(Size::S64, Location::GPR(GPR::R15)); - } fn pop_stack_locals(&mut self, delta_stack_offset: u32) { self.assembler.emit_add( Size::S64, @@ -1881,14 +1883,80 @@ impl Machine for MachineX86_64 { } // Get param location - fn get_param_location(&self, idx: usize, calling_convention: CallingConvention) -> Location { + fn get_param_location( + &self, + idx: usize, + _sz: Size, + stack_location: &mut usize, + calling_convention: CallingConvention, + ) -> Location { + match calling_convention { + CallingConvention::WindowsFastcall => match idx { + 0 => Location::GPR(GPR::RCX), + 1 => Location::GPR(GPR::RDX), + 2 => Location::GPR(GPR::R8), + 3 => Location::GPR(GPR::R9), + _ => { + let loc = Location::Memory(GPR::RSP, *stack_location as i32); + *stack_location += 8; + loc + } + }, + _ => match idx { + 0 => Location::GPR(GPR::RDI), + 1 => Location::GPR(GPR::RSI), + 2 => Location::GPR(GPR::RDX), + 3 =>
Location::GPR(GPR::RCX), + 4 => Location::GPR(GPR::R8), + 5 => Location::GPR(GPR::R9), + _ => { + let loc = Location::Memory(GPR::RSP, *stack_location as i32); + *stack_location += 8; + loc + } + }, + } + } + // Get call param location + fn get_call_param_location( + &self, + idx: usize, + _sz: Size, + _stack_location: &mut usize, + calling_convention: CallingConvention, + ) -> Location { + match calling_convention { + CallingConvention::WindowsFastcall => match idx { + 0 => Location::GPR(GPR::RCX), + 1 => Location::GPR(GPR::RDX), + 2 => Location::GPR(GPR::R8), + 3 => Location::GPR(GPR::R9), + _ => Location::Memory(GPR::RBP, (32 + 16 + (idx - 4) * 8) as i32), + }, + _ => match idx { + 0 => Location::GPR(GPR::RDI), + 1 => Location::GPR(GPR::RSI), + 2 => Location::GPR(GPR::RDX), + 3 => Location::GPR(GPR::RCX), + 4 => Location::GPR(GPR::R8), + 5 => Location::GPR(GPR::R9), + _ => Location::Memory(GPR::RBP, (16 + (idx - 6) * 8) as i32), + }, + } + } + // Get simple param location + fn get_simple_param_location( + &self, + idx: usize, + calling_convention: CallingConvention, + ) -> Location { match calling_convention { CallingConvention::WindowsFastcall => match idx { 0 => Location::GPR(GPR::RCX), 1 => Location::GPR(GPR::RDX), 2 => Location::GPR(GPR::R8), 3 => Location::GPR(GPR::R9), - _ => Location::Memory(GPR::RBP, (16 + 32 + (idx - 4) * 8) as i32), + _ => Location::Memory(GPR::RBP, (32 + 16 + (idx - 4) * 8) as i32), }, _ => match idx { 0 => Location::GPR(GPR::RDI), @@ -1912,9 +1980,9 @@ impl Machine for MachineX86_64 { self.assembler.emit_mov(size, source, dest); } Location::Memory(_, _) | Location::Memory2(_, _, _, _) => { - self.assembler - .emit_mov(size, source, Location::GPR(GPR::RAX)); - self.assembler.emit_mov(size, Location::GPR(GPR::RAX), dest); + let tmp = self.pick_temp_gpr().unwrap(); + self.assembler.emit_mov(size, source, Location::GPR(tmp)); + self.assembler.emit_mov(size, Location::GPR(tmp), dest); } _ => unreachable!(), }, @@ -1923,9 +1991,9 @@ impl Machine for MachineX86_64 { self.assembler.emit_mov(size, source, dest); } Location::Memory(_, _) | Location::Memory2(_, _, _, _) => { - self.assembler - .emit_mov(size, source, Location::GPR(GPR::RAX)); - self.assembler.emit_mov(size, Location::GPR(GPR::RAX), dest); + let tmp = self.pick_temp_gpr().unwrap(); + self.assembler.emit_mov(size, source, Location::GPR(tmp)); + self.assembler.emit_mov(size, Location::GPR(tmp), dest); } _ => unreachable!(), }, @@ -1934,9 +2002,9 @@ impl Machine for MachineX86_64 { self.assembler.emit_mov(size, source, dest); } Location::Memory(_, _) | Location::Memory2(_, _, _, _) => { - self.assembler - .emit_mov(size, source, Location::GPR(GPR::RAX)); - self.assembler.emit_mov(size, Location::GPR(GPR::RAX), dest); + let tmp = self.pick_temp_gpr().unwrap(); + self.assembler.emit_mov(size, source, Location::GPR(tmp)); + self.assembler.emit_mov(size, Location::GPR(tmp), dest); } _ => unreachable!(), }, @@ -1955,21 +2023,35 @@ impl Machine for MachineX86_64 { size_op: Size, dest: Location, ) { + let dst = match dest { + Location::Memory(_, _) | Location::Memory2(_, _, _, _) => { + Location::GPR(self.acquire_temp_gpr().unwrap()) + } + Location::GPR(_) | Location::SIMD(_) => dest, + _ => unreachable!(), + }; match source { Location::GPR(_) | Location::Memory(_, _) | Location::Memory2(_, _, _, _) => { match size_val { - Size::S32 | Size::S64 => self.assembler.emit_mov(size_val, source, dest), + Size::S32 | Size::S64 => self.assembler.emit_mov(size_val, source, dst), Size::S16 | Size::S8 => { if signed { - 
self.assembler.emit_movsx(size_val, source, size_op, dest) + self.assembler.emit_movsx(size_val, source, size_op, dst) } else { - self.assembler.emit_movzx(size_val, source, size_op, dest) + self.assembler.emit_movzx(size_val, source, size_op, dst) } } } } _ => unreachable!(), } + if dst != dest { + self.assembler.emit_mov(size_op, dst, dest); + match dst { + Location::GPR(x) => self.release_gpr(x), + _ => unreachable!(), + }; + } } fn load_address(&mut self, size: Size, reg: Location, mem: Location) { match reg { @@ -2147,6 +2229,10 @@ impl Machine for MachineX86_64 { .arch_emit_indirect_call_with_trampoline(location); } + fn emit_debug_breakpoint(&mut self) { + self.assembler.emit_bkpt(); + } + fn emit_call_location(&mut self, location: Location) { self.assembler.emit_call_location(location); } @@ -2316,6 +2402,7 @@ impl Machine for MachineX86_64 { loc_b: Location, ret: Location, integer_division_by_zero: Label, + _integer_overflow: Label, ) -> usize { // We assume that RAX and RDX are temporary registers here. self.assembler @@ -2338,6 +2425,7 @@ impl Machine for MachineX86_64 { loc_b: Location, ret: Location, integer_division_by_zero: Label, + _integer_overflow: Label, ) -> usize { // We assume that RAX and RDX are temporary registers here. self.assembler @@ -2359,6 +2447,7 @@ impl Machine for MachineX86_64 { loc_b: Location, ret: Location, integer_division_by_zero: Label, + _integer_overflow: Label, ) -> usize { // We assume that RAX and RDX are temporary registers here. self.assembler @@ -2381,6 +2470,7 @@ impl Machine for MachineX86_64 { loc_b: Location, ret: Location, integer_division_by_zero: Label, + _integer_overflow: Label, ) -> usize { // We assume that RAX and RDX are temporary registers here. let normal_path = self.assembler.get_label(); @@ -3753,6 +3843,7 @@ impl Machine for MachineX86_64 { loc_b: Location, ret: Location, integer_division_by_zero: Label, + _integer_overflow: Label, ) -> usize { // We assume that RAX and RDX are temporary registers here. self.assembler @@ -3775,6 +3866,7 @@ impl Machine for MachineX86_64 { loc_b: Location, ret: Location, integer_division_by_zero: Label, + _integer_overflow: Label, ) -> usize { // We assume that RAX and RDX are temporary registers here. self.assembler @@ -3796,6 +3888,7 @@ impl Machine for MachineX86_64 { loc_b: Location, ret: Location, integer_division_by_zero: Label, + _integer_overflow: Label, ) -> usize { // We assume that RAX and RDX are temporary registers here. self.assembler @@ -3818,6 +3911,7 @@ impl Machine for MachineX86_64 { loc_b: Location, ret: Location, integer_division_by_zero: Label, + _integer_overflow: Label, ) -> usize { // We assume that RAX and RDX are temporary registers here. let normal_path = self.assembler.get_label(); @@ -6534,7 +6628,9 @@ impl Machine for MachineX86_64 { // Calculate stack offset. 
let mut stack_offset: u32 = 0; for (i, _param) in sig.params().iter().enumerate() { - if let Location::Memory(_, _) = self.get_param_location(1 + i, calling_convention) { + if let Location::Memory(_, _) = + self.get_simple_param_location(1 + i, calling_convention) + { stack_offset += 8; } } @@ -6562,12 +6658,12 @@ impl Machine for MachineX86_64 { // Arguments a.emit_mov( Size::S64, - self.get_param_location(1, calling_convention), + self.get_simple_param_location(1, calling_convention), Location::GPR(GPR::R15), ); // func_ptr a.emit_mov( Size::S64, - self.get_param_location(2, calling_convention), + self.get_simple_param_location(2, calling_convention), Location::GPR(GPR::R14), ); // args_rets @@ -6577,7 +6673,7 @@ impl Machine for MachineX86_64 { let mut n_stack_args: usize = 0; for (i, _param) in sig.params().iter().enumerate() { let src_loc = Location::Memory(GPR::R14, (i * 16) as _); // args_rets[i] - let dst_loc = self.get_param_location(1 + i, calling_convention); + let dst_loc = self.get_simple_param_location(1 + i, calling_convention); match dst_loc { Location::GPR(_) => { @@ -6922,42 +7018,3 @@ impl Machine for MachineX86_64 { } } } - -// Constants for the bounds of truncation operations. These are the least or -// greatest exact floats in either f32 or f64 representation less-than (for -// least) or greater-than (for greatest) the i32 or i64 or u32 or u64 -// min (for least) or max (for greatest), when rounding towards zero. - -/// Greatest Exact Float (32 bits) less-than i32::MIN when rounding towards zero. -const GEF32_LT_I32_MIN: f32 = -2147483904.0; -/// Least Exact Float (32 bits) greater-than i32::MAX when rounding towards zero. -const LEF32_GT_I32_MAX: f32 = 2147483648.0; -/// Greatest Exact Float (32 bits) less-than i64::MIN when rounding towards zero. -const GEF32_LT_I64_MIN: f32 = -9223373136366403584.0; -/// Least Exact Float (32 bits) greater-than i64::MAX when rounding towards zero. -const LEF32_GT_I64_MAX: f32 = 9223372036854775808.0; -/// Greatest Exact Float (32 bits) less-than u32::MIN when rounding towards zero. -const GEF32_LT_U32_MIN: f32 = -1.0; -/// Least Exact Float (32 bits) greater-than u32::MAX when rounding towards zero. -const LEF32_GT_U32_MAX: f32 = 4294967296.0; -/// Greatest Exact Float (32 bits) less-than u64::MIN when rounding towards zero. -const GEF32_LT_U64_MIN: f32 = -1.0; -/// Least Exact Float (32 bits) greater-than u64::MAX when rounding towards zero. -const LEF32_GT_U64_MAX: f32 = 18446744073709551616.0; - -/// Greatest Exact Float (64 bits) less-than i32::MIN when rounding towards zero. -const GEF64_LT_I32_MIN: f64 = -2147483649.0; -/// Least Exact Float (64 bits) greater-than i32::MAX when rounding towards zero. -const LEF64_GT_I32_MAX: f64 = 2147483648.0; -/// Greatest Exact Float (64 bits) less-than i64::MIN when rounding towards zero. -const GEF64_LT_I64_MIN: f64 = -9223372036854777856.0; -/// Least Exact Float (64 bits) greater-than i64::MAX when rounding towards zero. -const LEF64_GT_I64_MAX: f64 = 9223372036854775808.0; -/// Greatest Exact Float (64 bits) less-than u32::MIN when rounding towards zero. -const GEF64_LT_U32_MIN: f64 = -1.0; -/// Least Exact Float (64 bits) greater-than u32::MAX when rounding towards zero. -const LEF64_GT_U32_MAX: f64 = 4294967296.0; -/// Greatest Exact Float (64 bits) less-than u64::MIN when rounding towards zero. -const GEF64_LT_U64_MIN: f64 = -1.0; -/// Least Exact Float (64 bits) greater-than u64::MAX when rounding towards zero. 
-const LEF64_GT_U64_MAX: f64 = 18446744073709551616.0; diff --git a/lib/compiler-singlepass/src/x64_decl.rs b/lib/compiler-singlepass/src/x64_decl.rs index 96590527562..14f63ff5fe2 100644 --- a/lib/compiler-singlepass/src/x64_decl.rs +++ b/lib/compiler-singlepass/src/x64_decl.rs @@ -166,35 +166,6 @@ impl CombinedRegister for X64Register { _ => return None, }) } - - /// Returns the instruction prefix for `movq %this_reg, ?(%rsp)`. - /// - /// To build an instruction, append the memory location as a 32-bit - /// offset to the stack pointer to this prefix. - fn _prefix_mov_to_stack(&self) -> Option<&'static [u8]> { - Some(match *self { - X64Register::GPR(gpr) => match gpr { - GPR::RDI => &[0x48, 0x89, 0xbc, 0x24], - GPR::RSI => &[0x48, 0x89, 0xb4, 0x24], - GPR::RDX => &[0x48, 0x89, 0x94, 0x24], - GPR::RCX => &[0x48, 0x89, 0x8c, 0x24], - GPR::R8 => &[0x4c, 0x89, 0x84, 0x24], - GPR::R9 => &[0x4c, 0x89, 0x8c, 0x24], - _ => return None, - }, - X64Register::XMM(xmm) => match xmm { - XMM::XMM0 => &[0x66, 0x0f, 0xd6, 0x84, 0x24], - XMM::XMM1 => &[0x66, 0x0f, 0xd6, 0x8c, 0x24], - XMM::XMM2 => &[0x66, 0x0f, 0xd6, 0x94, 0x24], - XMM::XMM3 => &[0x66, 0x0f, 0xd6, 0x9c, 0x24], - XMM::XMM4 => &[0x66, 0x0f, 0xd6, 0xa4, 0x24], - XMM::XMM5 => &[0x66, 0x0f, 0xd6, 0xac, 0x24], - XMM::XMM6 => &[0x66, 0x0f, 0xd6, 0xb4, 0x24], - XMM::XMM7 => &[0x66, 0x0f, 0xd6, 0xbc, 0x24], - _ => return None, - }, - }) - } } /// An allocator that allocates registers for function arguments according to the System V ABI. diff --git a/tests/ignores.txt b/tests/ignores.txt index 5e1dd68bfee..bb1eed19183 100644 --- a/tests/ignores.txt +++ b/tests/ignores.txt @@ -68,6 +68,10 @@ cranelift spec::simd::simd_int_to_int_extend # Windows doesn't overcommit and fails to allocate 4GB of memory windows wasmer::max_size_of_memory +# Some AArch64 CPUs have an issue where a 64-bit write that segfaults across a page boundary may still have its first 32 bits written. +aarch64+linux spec::align +aarch64+linux spec::memory_trap + # Frontends ## WASI
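
For illustration of the new `round_stack_adjust` hook: on x86-64 it simply returns its argument unchanged, but the accompanying comment notes that stack adjustments must be rounded to a multiple of 16 bytes on ARM64, where SP has to stay 16-byte aligned. The sketch below is a hypothetical free-standing helper (not part of the patch) showing the usual bit-mask formulation such an AArch64 implementation could use.

```rust
/// Hypothetical helper (illustration only, not from the patch): round a stack
/// adjustment up to the next multiple of 16 bytes, as AAPCS64 requires for SP.
fn round_stack_adjust_16(value: usize) -> usize {
    (value + 15) & !15
}

fn main() {
    // The x86-64 implementation may return `value` unchanged; an AArch64
    // backend is expected to round up as sketched here.
    assert_eq!(round_stack_adjust_16(0), 0);
    assert_eq!(round_stack_adjust_16(8), 16);
    assert_eq!(round_stack_adjust_16(24), 32);
    assert_eq!(round_stack_adjust_16(32), 32);
}
```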